KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > de > susebox > jtopas > AbstractTokenizer


1 /*
2  * AbstractTokenizer.java: base class for Tokenizer implementations.
3  *
4  * Copyright (C) 2004 Heiko Blau
5  *
6  * This file belongs to the JTopas Library.
7  * JTopas is free software; you can redistribute it and/or modify it
8  * under the terms of the GNU Lesser General Public License as published by the
9  * Free Software Foundation; either version 2.1 of the License, or (at your
10  * option) any later version.
11  *
12  * This software is distributed in the hope that it will be useful, but WITHOUT
13  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14  * FITNESS FOR A PARTICULAR PURPOSE.
15  * See the GNU Lesser General Public License for more details.
16  *
17  * You should have received a copy of the GNU Lesser General Public License along
18  * with JTopas. If not, write to the
19  *
20  * Free Software Foundation, Inc.
21  * 59 Temple Place, Suite 330,
22  * Boston, MA 02111-1307
23  * USA
24  *
25  * or check the Internet: http://www.fsf.org
26  *
27  * Contact:
28  * email: heiko@susebox.de
29  */

30
31 package de.susebox.jtopas;
32
33 //-----------------------------------------------------------------------------
34
// Imports
35
//
36
import java.io.Reader;

import java.util.Arrays;
import java.util.LinkedList;
import java.util.SortedMap;
import java.util.TreeMap;

import de.susebox.java.lang.ExtIndexOutOfBoundsException;

import de.susebox.jtopas.spi.DataMapper;
import de.susebox.jtopas.spi.DataProvider;
import de.susebox.jtopas.spi.KeywordHandler;
import de.susebox.jtopas.spi.PatternHandler;
import de.susebox.jtopas.spi.SeparatorHandler;
import de.susebox.jtopas.spi.SequenceHandler;
import de.susebox.jtopas.spi.StandardKeywordHandler;
import de.susebox.jtopas.spi.StandardSeparatorHandler;
import de.susebox.jtopas.spi.StandardSequenceHandler;
import de.susebox.jtopas.spi.StandardWhitespaceHandler;
import de.susebox.jtopas.spi.WhitespaceHandler;
58
59 //-----------------------------------------------------------------------------
60
// Class AbstractTokenizer
61
//
62

/**<p>
 * Base class for {@link Tokenizer} implementations. <code>AbstractTokenizer</code>
 * separates the data analysis from the actual data provision. Although the class
 * maintains read and write positions, the physical representation of the logical
 * character buffer behind these positions concerns only the subclasses.
 *</p>
 *
 * @see Tokenizer
 * @see TokenizerProperties
 * @author Heiko Blau
 */

74 public abstract class AbstractTokenizer implements Tokenizer, TokenizerPropertyListener {
75
76   //---------------------------------------------------------------------------
77
// Abstract methods
78
//
79

80   /**
81    * Subclasses have to provide {@link de.susebox.jtopas.spi.DataProvider}
82    * instances for various token type handlers. The given start position is the
83    * absolute number of characters from the beginning of the data source.
84    *
85    * @param startPos position in the input data
86    * @param length number of characters
87    * @return the <code>DataProvider</code> for the given data range
88    */

89   protected abstract DataProvider getDataProvider(int startPos, int length);
90
91   /**
92    * This method is called when the tokenizer runs out of data. Its main purpose
93    * is to call the {@link TokenizerSource#read} method. It is also responsible
94    * to handle the flag {@link TokenizerProperties#F_KEEP_DATA} flag).
95    *
96    * @return number of read bytes or -1 if an end-of-file condition occured
97    * @throws TokenizerException wrapped exceptions from the {@link TokenizerSource#read}
98    * method
99    */

100   protected abstract int readMoreData() throws TokenizerException;
101   
102   
103   //---------------------------------------------------------------------------
104
// Constructors
105
//
106

107   /**
108    * Default constructor that sets the tokenizer control flags as it would be
109    * approbriate for C/C++ and Java. Found token images are copied. No line nor
110    * column informations are provided. Nested comments are not allowed.
111    *<br>
112    * The tokenizer will use the {@link TokenizerProperties#DEFAULT_WHITESPACES}
113    * and {@link TokenizerProperties#DEFAULT_SEPARATORS} for whitespace and
114    * separator handling.
115    */

116   public AbstractTokenizer() {
117     _baseTokenizer = this;
118     if (_defaultProperties == null) {
119       _defaultProperties = new StandardTokenizerProperties();
120     }
121     setTokenizerProperties(_defaultProperties);
122   }
123   
124   /**
125    * Contructing a <code>AbstractTokenizer</code> with a backing {@link TokenizerProperties}
126    * instance.
127    *
128    * @param properties an {@link TokenizerProperties} object containing the
129    * settings for the tokenizing process
130    */

131   public AbstractTokenizer(TokenizerProperties properties) {
132     _baseTokenizer = this;
133     setTokenizerProperties(properties);
134   }
135
136   
137   //---------------------------------------------------------------------------
138
// data source
139
//
140

141   /**
142    * Setting the source of data. This method is usually called during setup of
143    * the <code>Tokenizer</code> but may also be invoked while the tokenizing
144    * is in progress. It will reset the tokenizers input buffer, line and column
145    * counters etc.
146    *<br>
147    * Subclasses should override this method to do their own actions on a data source
148    * change. Generally, this base method should be called first in the subclass
149    * implementation of <code>setSource</code> (equivalent to super calls in
150    * constructors of derived classes).
151    *
152    * @param source a {@link TokenizerSource} to read data from
153    * @see #getSource
154    */

155   public void setSource(TokenizerSource source) {
156     _source = source;
157     _eofReached = false;
158     _currentReadPos = 0;
159     _currentWritePos = 0;
160     if (isFlagSet(Flags.F_COUNT_LINES)) {
161       _lineNumber = 0;
162       _columnNumber = 0;
163     } else {
164       _lineNumber = -1;
165       _columnNumber = -1;
166     }
167     Arrays.fill(_scannedToken, null);
168   }
169   
170   /**
171    * Convenience method to avoid the construction of a {@link TokenizerSource}
172    * from the most important data source {@link java.io.Reader}.
173    *
174    * @param reader the {@link java.io.Reader} to get data from
175    */

176   public void setSource(Reader JavaDoc reader) {
177     setSource(new ReaderSource(reader));
178   }
179    
180   /**
181    * Retrieving the {@link TokenizerSource} of this <code>Tokenizer</code>. The
182    * method may return <code>null</code> if there is no <code>TokenizerSource</code>
183    * associated with it.
184    *
185    * @param the {@link TokenizerSource} associated with this <code>Tokenizer</code>
186    * @see #setSource
187    */

188   public TokenizerSource getSource() {
189     return _source;
190   }
191   
192   
193   //---------------------------------------------------------------------------
194
// Methods of the Tokenizer interface
195
//
196

197   /**
198    * Setting the tokenizer characteristics. See the method description in
199    * {@link Tokenizer}.
200    *
201    * @param props the {@link TokenizerProperties} for this tokenizer
202    * @throws NullPointerException if the <code>null</code> is passed to the call
203    * @see #getTokenizerProperties
204    */

205   public void setTokenizerProperties(TokenizerProperties props) throws NullPointerException JavaDoc {
206     if (props == null) {
207       throw new NullPointerException JavaDoc();
208     }
209
210     // set properties
211
if (_properties != null) {
212       _properties.removeTokenizerPropertyListener(this);
213     }
214     _properties = props;
215     _properties.addTokenizerPropertyListener(this);
216
217     // who is going to handle the various token types ?
218
if (_properties instanceof WhitespaceHandler) {
219       setWhitespaceHandler((WhitespaceHandler)_properties);
220     } else {
221       setWhitespaceHandler(new StandardWhitespaceHandler(_properties));
222     }
223     if (_properties instanceof SeparatorHandler) {
224       setSeparatorHandler((SeparatorHandler)_properties);
225     } else {
226       setSeparatorHandler(new StandardSeparatorHandler(_properties));
227     }
228     if (_properties instanceof SequenceHandler) {
229       setSequenceHandler((SequenceHandler)_properties);
230     } else {
231       setSequenceHandler(new StandardSequenceHandler(_properties));
232     }
233     if (props instanceof KeywordHandler) {
234       setKeywordHandler((KeywordHandler)props);
235     } else {
236       setKeywordHandler(new StandardKeywordHandler(_properties));
237     }
238     if (_properties instanceof PatternHandler) {
239       setPatternHandler((PatternHandler)_properties);
240     } else {
241       setPatternHandler(null);
242     }
243       
244     // flag handling
245
int newFlags = _properties.getParseFlags();
246
247     if (newFlags != _flags) {
248       propertyChanged(new TokenizerPropertyEvent(
249                             TokenizerPropertyEvent.PROPERTY_MODIFIED,
250                             new TokenizerProperty(TokenizerProperty.PARSE_FLAG_MASK,
251                                                   new String JavaDoc[] { Integer.toBinaryString(newFlags) } ),
252                             new TokenizerProperty(TokenizerProperty.PARSE_FLAG_MASK,
253                                                   new String JavaDoc[] { Integer.toBinaryString(_flags) } )));
254     }
255   }
256
257   /**
258    * Retrieving the current tokenizer characteristics. See the method description
259      * in {@link Tokenizer}.
260    *
261    * @return the {@link TokenizerProperties} of this <code>Tokenizer</code>
262    * @see #setTokenizerProperties
263    */

264   public TokenizerProperties getTokenizerProperties() {
265     return _properties;
266   }
267   
268   /**
269    * Setting the control flags of the <code>Tokenizer</code>. See the method
270    * description in {@link Tokenizer}.
271    *
272    * @param flags the parser control flags
273    * @param mask the mask for the flags to set or unset
274    * @throws TokenizerException if one or more of the flags given cannot be honored
275    * @see #getParseFlags
276    */

277   public void changeParseFlags(int flags, int mask) throws TokenizerException {
278     // test the given flags
279
if ((mask | VALID_FLAGS_MASK) != VALID_FLAGS_MASK) {
280       throw new TokenizerException(
281                   "One or more flags cannot be set separately for a {0}. Violating flags in {1}: {2}.",
282                   new Object JavaDoc[] { AbstractTokenizer.class.getName(),
283                                  Integer.toHexString(flags),
284                                  Integer.toHexString(mask & ~VALID_FLAGS_MASK) } );
285     }
286     
287     // set the new flags for this tokenizer
288
_flagMask = mask;
289     _flags = (flags & mask) | (getTokenizerProperties().getParseFlags() & ~mask);
290
291     // when counting lines initialize the current line and column position
292
if ( ! isFlagSet(Flags.F_COUNT_LINES)) {
293       _lineNumber = 0;
294       _columnNumber = 0;
295     }
296   }
297
298   /**
299    * Retrieving the parser control flags. See the method description in
300    * {@link Tokenizer}.
301    *
302    * @return the current parser control flags
303    * @see #changeParseFlags
304    */

305   public int getParseFlags() {
306     return (getTokenizerProperties().getParseFlags() & ~_flagMask) + (_flags & _flagMask);
307   }
308   
309   /**
310    * Setting a new {@link de.susebox.jtopas.spi.KeywordHandler} or removing any
311    * previously installed one. See the method description in {@link Tokenizer}.
312    *
313    * @param handler the (new) {@link KeywordHandler} to use or <code>null</code>
314    * to remove it
315    */

316   public void setKeywordHandler(de.susebox.jtopas.spi.KeywordHandler handler) {
317     synchronized(this) {
318       if (handler == _properties) {
319         if (_properties != null && _properties.getKeywords().hasNext()) {
320           _keywordHandler = handler;
321         } else {
322           _keywordHandler = null;
323         }
324         _internalFlags &= ~IFLAG_EXTERNAL_KEYWORD_HANDLER;
325       } else {
326         _keywordHandler = handler;
327         _internalFlags |= IFLAG_EXTERNAL_KEYWORD_HANDLER;
328       }
329     }
330   }
331   
332   /**
333    * Retrieving the current {@link de.susebox.jtopas.spi.KeywordHandler}. See the
334    * method description in {@link Tokenizer}.
335    *
336    * @return the currently active whitespace keyword or <code>null</code>, if
337    * keyword support is switched off
338    */

339   public de.susebox.jtopas.spi.KeywordHandler getKeywordHandler() {
340     synchronized(this) {
341       if ((_internalFlags & IFLAG_EXTERNAL_KEYWORD_HANDLER) == 0) {
342         return (de.susebox.jtopas.spi.KeywordHandler)getTokenizerProperties();
343       } else {
344         return _keywordHandler;
345       }
346     }
347   }
348   
349   /**
350    * Setting a new {@link de.susebox.jtopas.spi.WhitespaceHandler} or removing any
351    * previously installed one. See the method description in {@link Tokenizer}.
352    *
353    * @param handler the (new) whitespace handler to use or <code>null</code> to
354    * switch off whitespace handling
355    * @see #getWhitespaceHandler
356    */

357   public void setWhitespaceHandler(de.susebox.jtopas.spi.WhitespaceHandler handler) {
358     _whitespaceHandler = handler;
359   }
360   
361   /**
362    * Retrieving the current {@link de.susebox.jtopas.spi.WhitespaceHandler}. See
363    * the method description in {@link Tokenizer}.
364    *
365    * @return the currently active whitespace handler or null, if the base
366    * implementation is working
367    */

368   public de.susebox.jtopas.spi.WhitespaceHandler getWhitespaceHandler() {
369     return _whitespaceHandler;
370   }
371   
372   /**
373    * Setting a new {@link de.susebox.jtopas.spi.SeparatorHandler} or removing any
374    * previously installed <code>SeparatorHandler</code>. See the method description
375    * in {@link Tokenizer}.
376    *
377    * @param handler the (new) separator handler to use or <code>null</code> to
378    * remove it
379    * @see #getSeparatorHandler
380    */

381   public void setSeparatorHandler(de.susebox.jtopas.spi.SeparatorHandler handler) {
382     _separatorHandler = handler;
383   }
384   
385   /**
386    * Retrieving the current {@link de.susebox.jtopas.spi.SeparatorHandler}. See
387    * the method description in {@link Tokenizer}.
388    *
389    * @return the currently active {@link SeparatorHandler} or <code>null</code>,
390    * if separators aren't recognized by the tokenizer
391    * @see #setSequenceHandler
392    */

393   public de.susebox.jtopas.spi.SeparatorHandler getSeparatorHandler() {
394     return _separatorHandler;
395   }
396   
397   /**
398    * Setting a new {@link de.susebox.jtopas.spi.SequenceHandler} or removing any
399    * previously installed one. See the method description in {@link Tokenizer}.
400    *
401    * @param handler the (new) {@link SequenceHandler} to use or null to remove it
402    */

403   public void setSequenceHandler(de.susebox.jtopas.spi.SequenceHandler handler) {
404     synchronized(this) {
405       if (handler == _properties) {
406         if (_properties != null && ( _properties.getSpecialSequences().hasNext()
407                                     || _properties.getStrings().hasNext()
408                                     || _properties.getBlockComments().hasNext()
409                                     || _properties.getLineComments().hasNext())) {
410           _sequenceHandler = handler;
411         } else {
412           _sequenceHandler = null;
413         }
414         _internalFlags &= ~IFLAG_EXTERNAL_SEQUENCE_HANDLER;
415       } else {
416         _sequenceHandler = handler;
417         _internalFlags |= IFLAG_EXTERNAL_SEQUENCE_HANDLER;
418       }
419     }
420   }
421   
422   /**
423    * Retrieving the current {@link SequenceHandler}. See the method description
424    * in {@link Tokenizer}.
425    *
426    * @return the currently active {@link SequenceHandler} or null, if the base
427    * implementation is working
428    */

429   public de.susebox.jtopas.spi.SequenceHandler getSequenceHandler() {
430     synchronized(this) {
431       if ((_internalFlags & IFLAG_EXTERNAL_SEQUENCE_HANDLER) == 0) {
432         return (de.susebox.jtopas.spi.SequenceHandler)getTokenizerProperties();
433       } else {
434         return _sequenceHandler;
435       }
436     }
437   }
438   
439   /**
440    * Setting a new {@link de.susebox.jtopas.spi.PatternHandler} or removing any
441    * previously installed one. See the method description in {@link Tokenizer}.
442    *
443    * @param handler the (new) {@link de.susebox.jtopas.spi.PatternHandler} to
444    * use or <code>null</code> to remove it
445    * @see #getPatternHandler
446    */

447   public void setPatternHandler(de.susebox.jtopas.spi.PatternHandler handler) {
448     synchronized(this) {
449       if (handler == _properties) {
450         if (_properties != null && _properties.getPatterns().hasNext()) {
451           _patternHandler = handler;
452         } else {
453           _patternHandler = null;
454         }
455         _internalFlags &= ~IFLAG_EXTERNAL_PATTERN_HANDLER;
456       } else {
457         _patternHandler = handler;
458         _internalFlags |= IFLAG_EXTERNAL_PATTERN_HANDLER;
459       }
460     }
461   }
462   
463   /**
464    * Retrieving the current {@link de.susebox.jtopas.spi.PatternHandler}. See the
465    * method description in {@link Tokenizer}.
466    *
467    * @return the currently active {@link de.susebox.jtopas.spi.PatternHandler}
468    * or <code>null</code>, if patterns are not recognized by the tokenizer
469    * @see #setPatternHandler
470    */

471   public de.susebox.jtopas.spi.PatternHandler getPatternHandler() {
472     synchronized(this) {
473       if ((_internalFlags & IFLAG_EXTERNAL_PATTERN_HANDLER) == 0) {
474         return (de.susebox.jtopas.spi.PatternHandler)getTokenizerProperties();
475       } else {
476         return _patternHandler;
477       }
478     }
479   }
480   
481   /**
482    * Query the current row. The method can only be used if the flag {@link TokenizerProperties#F_COUNT_LINES}
483    * has been set. Without this flag being set, the return value is undefined.
484    *<br>
485    * Note that row counting starts with 0, while editors often use 1 for the first
486    * row.
487    *
488    * @return current row (starting with 0)
489    * or -1 if the flag {@link TokenizerProperties#F_COUNT_LINES} is set
490    */

491   public int getCurrentLine() {
492     return _lineNumber;
493   }
494   
495   /**
496    * Retrieve the current column. The method can only be used if the flag <code>F_COUNT_LINES</code>
497    * has been set.
498    * Without this flag being set, the return value is undefined.
499    * Note that column counting starts with 0, while editors often use 1 for the first
500    * column in one row.
501    *
502    * @return current column number (starting with 0)
503    */

504   public int getCurrentColumn() {
505     return _columnNumber;
506   }
507   
508   /**
509    * Checking if there are more tokens available. See the method description in
510    * {@link Tokenizer}.
511    *
512    * @return <code>true</code> if a ca_ll to {@link #nextToken} or {@link #nextImage}
513    * will succed, <code>false</code> otherwise
514    */

515   public boolean hasMoreToken() {
516     return _scannedToken[0] == null || _scannedToken[0].getType() != Token.EOF;
517   }
518   
519   /**
520    * Retrieving the next {@link Token}. See the method description in
521    * {@link Tokenizer}.
522    *
523    * @return found {@link Token} including the EOF token
524    * @throws TokenizerException generic exception (list) for all problems that may occur while parsing
525    * (IOExceptions for instance)
526    */

527   public Token nextToken() throws TokenizerException {
528     boolean returnIt = false;
529     
530     // Get the next token
531
__MAIN_LOOP__:
532     do {
533       // analyze look-ahead token
534
if (_scannedToken[1] == null) {
535         if ( ! isEOF(0)) {
536           if ( ! isWhitespace(0)) {
537             if ( ! isPattern(0, false)) {
538               if ( ! isSpecialSequence(0)) {
539                 if ( ! isSeparator(0)) {
540                   _scannedToken[1] = new Token(Token.NORMAL);
541                 }
542               }
543             }
544           }
545         }
546       }
547       _scannedToken[0] = _scannedToken[1];
548       _scannedToken[1] = _scannedToken[2];
549       _scannedToken[2] = null;
550       
551       // get new token or complete the previously found look-ahead token
552
Token token = _scannedToken[0];
553       TokenizerProperty prop = (TokenizerProperty)token.getCompanion();
554       
555       token.setCompanion((prop != null) ? prop.getCompanion() : null);
556       token.setStartPosition(getReadPosition());
557       token.setStartLine(_lineNumber);
558       token.setStartColumn(_columnNumber);
559
560       returnIt = true;
561       
562       switch (token.getType()) {
563       case Token.EOF:
564         token.setLength(0);
565         break;
566       case Token.WHITESPACE:
567         token.setLength(completeWhitespace());
568         returnIt = isFlagSet(Flags.F_RETURN_SIMPLE_WHITESPACES);
569         break;
570       case Token.SEPARATOR: // Separators are always single characters.
571
token.setLength(1);
572         break;
573       case Token.STRING:
574         token.setLength(completeString(prop));
575         break;
576       case Token.LINE_COMMENT:
577         token.setLength(completeLineComment(prop));
578         returnIt = isFlagSet(prop, Flags.F_RETURN_LINE_COMMENTS);
579         break;
580       case Token.BLOCK_COMMENT:
581         token.setLength(completeBlockComment(prop));
582         returnIt = isFlagSet(prop, Flags.F_RETURN_BLOCK_COMMENTS);
583         break;
584       case Token.SPECIAL_SEQUENCE:
585         token.setLength(prop.getImages()[0].length());
586         break;
587       case Token.PATTERN:
588         // already contained in the first look-ahead token, see token shifting
589
break;
590       default:
591         prop = completeBoundedToken(token);
592       }
593       
594       // compute new line and column positions (if flag is set) and complete
595
// the token
596
adjustLineAndColumn(token.getType(), token.getLength());
597       token.setEndLine(_lineNumber);
598       token.setEndColumn(_columnNumber);
599
600       // need to extract the image ?
601
if (returnIt) {
602         boolean tokenPosOnly = (prop != null) ? isFlagSet(prop, Flags.F_TOKEN_POS_ONLY) :
603                                                     isFlagSet(Flags.F_TOKEN_POS_ONLY);
604         boolean returnImageParts = (prop != null) ? isFlagSet(prop, Flags.F_RETURN_IMAGE_PARTS) :
605                                                     isFlagSet(Flags.F_RETURN_IMAGE_PARTS);
606         if ( ! tokenPosOnly || returnImageParts) {
607           token.setImage(getText(_currentReadPos, token.getLength()));
608         }
609         if (returnImageParts) {
610           switch (token.getType()) {
611           case Token.WHITESPACE:
612             token.setImageParts(splitIntoLines(token.getImage()));
613             break;
614           case Token.STRING:
615             token.setImageParts(splitString(prop, token.getImage()));
616             break;
617           case Token.LINE_COMMENT:
618             token.setImageParts(splitIntoLines(token.getImage().substring(prop.getImages()[0].length())));
619             break;
620           case Token.BLOCK_COMMENT:
621             token.setImageParts(splitBlockComment(prop, token.getImage()));
622             break;
623           case Token.PATTERN:
624             break;
625           case Token.EOF:
626             token.setImageParts(new String JavaDoc[] {} );
627             break;
628           default:
629             token.setImageParts(new String JavaDoc[] { token.getImage() } );
630           }
631         }
632       }
633
634       // this is the one and only point where the current read position is
635
// adjusted (except for the data shifting in readMoreData).
636
_currentReadPos += token.getLength();
637     
638     } while ( ! returnIt);
639
640     // the current token is the first in the list
641
return _scannedToken[0];
642   }
643   
644   /**
645    * This method is a convenience method. It returns only the next token image
646    * without any informations about its type or associated information. See the
647    * method description in {@link Tokenizer}.
648    *
649    * @return the token image of the next token
650    * @throws TokenizerException generic exception (list) for all problems that may occur while parsing
651    * (IOExceptions for instance)
652    * @see #currentImage
653    */

654   public String JavaDoc nextImage() throws TokenizerException {
655     nextToken();
656     return currentImage();
657   }
658   
659   /**
660    * Retrieve the {@link Token} that was found by the last call to {@link #nextToken}.
661    * See the method description in {@link Tokenizer}.
662    *
663    * @return the {@link Token} retrieved by the lahasest call to {@link #nextToken}.
664    * @throws TokenizerException if the tokenizer has no current token
665    */

666   public Token currentToken() throws TokenizerException {
667     if (_scannedToken[0] == null) {
668       throw new TokenizerException("No current token available (nextToken was not called / read position changed)");
669     }
670     return _scannedToken[0];
671   }
672  
673   /**
674    * Convenience method to retrieve only the token image of the {@link Token} that
675    * would be returned by {@link #currentToken}. See the method description in
676    * {@link Tokenizer}.
677    *
678    * @return the token image of the current token
679    * @see #currentToken
680    */

681   public String JavaDoc currentImage() throws TokenizerException {
682     Token token = currentToken();
683     
684     if (token.getType() == Token.EOF) {
685       return null;
686     } else if ( ! isFlagSet(Flags.F_TOKEN_POS_ONLY) || token.getImage() != null) {
687       return token.getImage();
688     } else {
689       return getText(token.getStartPosition(), token.getLength());
690     }
691   }
692   
693   /**
694    * If the flag {@link TokenizerProperties#F_COUNT_LINES} is set, this method will
695    * return the line number starting with 0 in the input stream. See the method
696    * description in {@link Tokenizer}.
697    *
698    * @return the current line number starting with 0 or -1 if no line numbers are supplied.
699    * @see #getColumnNumber
700    */

701   public int getLineNumber() {
702     return _lineNumber;
703   }
704   
705   /**
706    * If the flag {@link TokenizerProperties#F_COUNT_LINES} is set, this method will
707    * return the current column positionstarting with 0 in the input stream. See
708    * the method description in {@link Tokenizer}.
709    *
710    * @return the current column position
711    * @see #getLineNumber
712    */

713   public int getColumnNumber() {
714     return _columnNumber;
715   }
716   
717   /**
718    * Getting the current read offset. See the method description in
719    * {@link Tokenizer}.
720    *
721    * @return the absolute offset in characters from the start of the data source
722    * of the Tokenizer where reading will be continued
723    * @see #setReadPositionAbsolute
724    * @see #setReadPositionRelative
725    */

726   public int getReadPosition() {
727     return _currentReadPos;
728   }
729   
730   /**
731    * Retrieving the number of the currently available characters. See the method
732    * description in {@link Tokenizer}.
733    *
734    * @return number of currently available characters
735    */

736   public int currentlyAvailable() {
737     return _currentWritePos - getRangeStart();
738   }
739   
740   /**
741    * Try to read more data into the text buffer of the tokenizer. See the method
742    * description in {@link Tokenizer}.
743    *
744    * @return the number of character now available
745    * @throws TokenizerException generic exception (list) for all problems that
746    * may occur while reading (IOExceptions for instance)
747    */

748   public int readMore() throws TokenizerException {
749     readMoreDataFromBase();
750     return currentlyAvailable();
751   }
752
753   /**
754    * Returns the character at the given position. The method does not attempt to
755    * read more data.
756    *
757    * @param pos get character on this position in the data stream
758    * @return the character at the given position
759    * @throws IndexOutOfBoundsException if the parameter <code>pos</code> is not
760    * in the available text range (text window)
761    */

762   public char getChar(int pos) throws IndexOutOfBoundsException JavaDoc {
763     return getBaseDataProvider(pos, 1).getCharAt(0);
764   }
765   
766   /**
767    * Retrieve text from the currently available range. See the method description
768    * in {@link Tokenizer}.
769    *
770    * @param start position where the text begins
771    * @param len length of the text
772    * @return the text beginning at the given position ith the given length
773    * @throws IndexOutOfBoundsException if the starting position or the length
774    * is out of the current text window
775    */

776   public String JavaDoc getText(int start, int len) throws IndexOutOfBoundsException JavaDoc {
777     return getBaseDataProvider(start, len).toString();
778   }
779   
780   /**
781    * This method sets the tokenizers current read position to the given absolute
782    * read position. See the method description in {@link Tokenizer}.
783    *<br>
784    * When using this method with embedded tokenizers, the user is responsible to
785    * set the read position in the currently used tokenizer. It will be propagated
786    * by the next call to {@link #switchTo}. Until that point, a call to this
787    * method has no effect on the other tokenizers sharing the same data source.
788    *
789    * @param position absolute position for the next parse operation
790    * @throws IndexOutOfBoundsException if the parameter <code>position</code> is
791    * not in the available text range (text window)
792    * @see #setReadPositionRelative
793    */

794   public void setReadPositionAbsolute(int position) throws IndexOutOfBoundsException JavaDoc {
795     if (position < getRangeStart()) {
796       throw new ExtIndexOutOfBoundsException(
797                   "Invalid read position {0} below the current text window start {1}.",
798                   new Object JavaDoc[] { new Integer JavaDoc(position), new Integer JavaDoc(getRangeStart()) }
799                 );
800     } else if (position > _currentWritePos) {
801       throw new ExtIndexOutOfBoundsException(
802                   "Invalid read position {0} at or above the current text window end {1}.",
803                   new Object JavaDoc[] { new Integer JavaDoc(position), new Integer JavaDoc(currentlyAvailable() + getRangeStart()) }
804                 );
805     }
806     _currentReadPos = position;
807     Arrays.fill(_scannedToken, null);
808     
809     // adjust line and column counting
810
if (isFlagSet(Flags.F_COUNT_LINES)) {
811       SortedMap JavaDoc map = _position2LineMap.headMap(new Integer JavaDoc(position + 1));
812       
813       if (map != null && ! map.isEmpty()) {
814         Integer JavaDoc lastLineStart = (Integer JavaDoc)map.lastKey();
815         
816         _lineNumber = ((Integer JavaDoc)map.get(lastLineStart)).intValue();
817         _columnNumber = position - lastLineStart.intValue();
818       } else {
819         _lineNumber = 0;
820         _columnNumber = position;
821       }
822     }
823   }
824
825   /**
826    * This method sets the tokenizers new read position the given number of characters
827    * forward (positive value) or backward (negative value) starting from the current
828    * read position. See the method description in {@link Tokenizer}.
829    *<br>
830    * When using this method with embedded tokenizers, the user is responsible to
831    * set the read position in the currently used tokenizer. It will be propagated
832    * by the next call to {@link #switchTo}. Until that point, a call to this
833    * method has no effect on the other tokenizers sharing the same data source.
834    *
835    * @param offset number of characters to move forward (positive offset) or
836    * backward (negative offset)
837    * @throws IndexOutOfBoundsException if the parameter <code>offset</code> would
838    * move the read position out of the available text range (text window)
839    * @see #setReadPositionAbsolute
840    */

841   public void setReadPositionRelative(int offset) throws IndexOutOfBoundsException JavaDoc {
842     setReadPositionAbsolute(getReadPosition() + offset);
843   }
844
845   /**
846    * Closing this tokenizer frees resources and deregisters from the
847    * associated {@link TokenizerProperties} object.
848    */

849   public void close() {
850     // deregister from the properties
851
if (_properties != null) {
852       _properties.removeTokenizerPropertyListener(this);
853       _properties = null;
854     }
855
856     // freeing memory
857
if (_position2LineMap != null) {
858       _position2LineMap.clear();
859       _position2LineMap = null;
860     }
861     
862     // adjust members
863
_eofReached = true;
864     _flags = 0;
865     _flagMask = 0;
866     _internalFlags = 0;
867     _currentReadPos = 0;
868     _currentWritePos = 0;
869     _lineNumber = -1;
870     _columnNumber = -1;
871     _nextTokenizer = null;
872     _prevTokenizer = null;
873     _whitespaceHandler = null;
874     _separatorHandler = null;
875     _keywordHandler = null;
876     _sequenceHandler = null;
877     _patternHandler = null;
878     _source = null;
879     Arrays.fill(_scannedToken, null);
880   }
881
882   
883   //---------------------------------------------------------------------------
884
// embedded tokenizer support
885
//
886

887   /**
888    * Adding an embedded tokenizer. Embedded tokenizer work on the same input
889    * buffer as their base tokenizer. A situation where embedded tokenizer could
890    * be applied, is a HTML stream with cascading style sheet (CSS) and JavaScript
891    * parts.
892    *<br>
893    * There are no internal means of switching from one tokenizer to another.
894    * This should be done by the caller using the method {@link #switchTo}.
895    *<br>
896    * The {@link TokenizerProperties#F_KEEP_DATA} and {@link TokenizerProperties#F_COUNT_LINES}
897    * flags of the base tokenizer take effect also in the embedded tokenizers.
898    *<br>
899    * Since is might be possible that the given <code>tokenizer</code> is a
900    * derivation of the <code>AbstractTokenizer</code> class, this method is
901    * synchronized on <code>tokenizer</code>.
902    *
903    * @param tokenizer an embedded tokenizer
904    * @throws TokenizerException if something goes wrong (not likely :-)
905    */

906   public void addTokenizer(AbstractTokenizer tokenizer) throws TokenizerException {
907     AbstractTokenizer curr = this;
908     
909     while (curr._nextTokenizer != null) {
910       curr = curr._nextTokenizer;
911     }
912     
913     if (tokenizer != null) {
914       synchronized(tokenizer) {
915         curr._nextTokenizer = tokenizer;
916         tokenizer._prevTokenizer = curr;
917
918         // share the input buffer of the base tokenizer
919
AbstractTokenizer baseTokenizer = getBaseTokenizer();
920         
921         tokenizer._baseTokenizer = baseTokenizer;
922         
923         // inherited flags
924
tokenizer.changeParseFlags(baseTokenizer.getParseFlags(), Flags.F_COUNT_LINES);
925       }
926     }
927   }
928
929   /**
 * Switching from one tokenizer to another. If the given tokenizer has not been
931    * added with {@link #addTokenizer}, an exception is thrown.<br>
 * The <code>switchTo</code> method does the necessary synchronisation between
 * <code>this</code> and the given tokenizer. The user is therefore responsible
 * to use <code>switchTo</code> whenever a tokenizer change is necessary. It
935    * must be done this way:
936    *<blockquote><pre>
937    * Tokenizer base = new MyTokenizer(...)
938    * Tokenizer embedded = new MyTokenizer(...)
939    *
940    * // setting properties (comments, keywords etc.)
941    * ...
942    *
943    * // embedding a tokenizer
944    * base.addTokenizer(embedded);
945    *
946    * // tokenizing with base
947    * ...
948    * if (<i>switch_condition</i>) {
949    * base.switchTo(embedded);
950    * }
951    *
952    * // tokenizing with embedded
953    * ...
954    * if (<i>switch_condition</i>) {
955    * embedded.switchTo(base);
956    * }
957    *</pre></blockquote>
958    * That way we avoid a more complex synchronisation between tokenizers whenever
959    * one of them parses the next data in the input stream. However, the danger
960    * of not synchronized tokenizers remains, so take care.
961    *<br>
962    * Since is might be possible that the given <code>tokenizer</code> is a
963    * derivation of the <code>AbstractTokenizer</code> class, this method is
964    * synchronized on <code>tokenizer</code>.
965    *
966    * @param tokenizer the tokenizer that should be used from now on
967    */

968   public void switchTo(AbstractTokenizer tokenizer) throws TokenizerException {
969     if (tokenizer != null) {
970       synchronized(tokenizer) {
971         if (tokenizer._baseTokenizer != _baseTokenizer) {
972           throw new TokenizerException("Trying to switch to an alien tokenizer (not added with addTokenizer).", null);
973         }
974         tokenizer._eofReached = this._eofReached;
975         tokenizer._currentReadPos = this._currentReadPos;
976         tokenizer._currentWritePos = this._currentWritePos;
977         tokenizer._columnNumber = this._columnNumber;
978         tokenizer._lineNumber = this._lineNumber;
979         tokenizer._position2LineMap = this._position2LineMap;
980       }
981     } else {
982       throw new TokenizerException(new NullPointerException JavaDoc());
983     }
984   }
985
986
987   //---------------------------------------------------------------------------
988
// Methods that may be overwritten in derived classes
989
//
990

991   /**
992    * This method checks if the character is a whitespace. Implement Your own
993    * code for situations where this default implementation is not fast enough
994    * or otherwise not really good.
995    *
996    * @param testChar check this character
997    * @return <code>true</code> if the given character is a whitespace,
998    * <code>false</code> otherwise
999    */

1000  protected boolean isWhitespace(char testChar) {
1001    if (_whitespaceHandler != null) {
1002      return _whitespaceHandler.isWhitespace(testChar);
1003    } else {
1004      return false;
1005    }
1006  }
1007      
1008  /**
1009   * This method detects the number of whitespace characters starting at the given
1010   * position. It should return the number of characters identified as whitespaces
1011   * starting from and including the given start position.
1012   *<br>
1013   * Then overriding this method, use {@link #getBaseDataProvider} to access characters.
1014   *<br>
1015   * Do not attempt to actually read more data or do anything that leads to the
1016   * change of the data source or to tokenizer switching. This is done by the
1017   * tokenizer framework.
1018   *
1019   * @param startingAtPos start checking for whitespace from this position
1020   * @param maxChars if there is no non-whitespace character, read up to this number of characters
1021   * @return number of whitespace characters starting from the given offset
1022   * @throws TokenizerException failure while reading data from the input stream
1023   */

1024  protected int readWhitespaces(int startingAtPos, int maxChars) throws TokenizerException {
1025    if (_whitespaceHandler != null) {
1026      DataProvider dataProvider = getBaseDataProvider(startingAtPos, maxChars);
1027      return _whitespaceHandler.countLeadingWhitespaces(dataProvider);
1028    } else {
1029      return 0;
1030    }
1031  }
1032  
1033  /**
1034   * This method checks if the character sequence starting at a given position
 * with a given length is a keyword. If so, it returns the keyword description
1036   * as {@link TokenizerProperty} object.
1037   *
1038   * @param startingAtPos check at this position
1039   * @param length the candidate has this number of characters
1040   * @throws TokenizerException routed exception from the active {@link de.susebox.jtopas.spi.KeywordHandler}
1041   * @return {@link TokenizerProperty} describing the keyword or <code>null</code>
1042   */

1043  protected TokenizerProperty isKeyword(int startingAtPos, int length) throws TokenizerException {
1044    if (_keywordHandler != null) {
1045      DataProvider dataProvider = getBaseDataProvider(startingAtPos, length);
1046      return _keywordHandler.isKeyword(dataProvider);
1047    } else {
1048      return null;
1049    }
1050  }
1051  
1052  
1053  //---------------------------------------------------------------------------
1054
// TokenizerPropertyListener methods
1055
//
1056

1057  /**
1058   * Splits a given String into lines. The method ist used to retrieve the
1059   * image parts of several token types.
1060   *
1061   * @param image split this string into lines
1062   * @return an array containing the lines of the image without line separator
1063   * characters
1064   */

1065  protected String JavaDoc[] splitIntoLines(String JavaDoc image) {
1066    LinkedList JavaDoc lines = new LinkedList JavaDoc();
1067    int index = 0;
1068    int start = 0;
1069    
1070    while (index < image.length()) {
1071      switch (image.charAt(index)) {
1072      case '\r':
1073        lines.add(image.substring(start, index));
1074        if (index + 1 < image.length() && image.charAt(index + 1) == '\n') {
1075          index += 2;
1076        } else {
1077          index++;
1078        }
1079        start = index;
1080        break;
1081      case '\n':
1082        lines.add(image.substring(start, index));
1083        start = ++index;
1084        break;
1085      default:
1086        index++;
1087      }
1088    }
1089    
1090    if (start < index || start > 0) {
1091      lines.add(image.substring(start, index));
1092    }
1093  
1094    return (String JavaDoc[])lines.toArray(new String JavaDoc[lines.size()]);
1095  }
1096  
1097  /**
1098   * Splits a given string into lines and removing string escapes. The method is
1099   * used to retrieve the image parts for string token types.
1100   *
1101   * @param prop the {@link TokenizerProperty} describing a string
1102   * @param image split this string into lines
1103   * @return an array containing the lines of the image without line separator
1104   * characters
1105   */

1106  protected String JavaDoc[] splitString(TokenizerProperty prop, String JavaDoc image) {
1107    // complete string
1108
String JavaDoc[] images = prop.getImages();
1109    String JavaDoc begin = images[0];
1110    String JavaDoc end = images[1];
1111    String JavaDoc esc = images[2];
1112    boolean noCase = isFlagSet(prop, Flags.F_NO_CASE);
1113    boolean escEqualsEnd = ( ! noCase && esc.compareTo(end) == 0)
1114                                  || ( noCase && esc.compareToIgnoreCase(end) == 0);
1115
1116    StringBuffer JavaDoc buffer = null;
1117    int index = begin.length();
1118    int start = index;
1119    int endIndex;
1120    
1121    if ( image.length() - start >= end.length()
1122        && ( ( ! noCase && end.equals(image.substring(image.length() - end.length())))
1123            || ( noCase && end.equalsIgnoreCase(image.substring(image.length() - end.length()))))) {
1124      endIndex = image.length() - end.length();
1125    } else {
1126      endIndex = image.length();
1127    }
1128    
1129    while (index < endIndex) {
1130      if ( ( ! noCase && image.startsWith(esc, index))
1131          || ( noCase && image.substring(index, index + esc.length()).equalsIgnoreCase(esc))) {
1132        if (buffer == null) {
1133          buffer = new StringBuffer JavaDoc(image.length());
1134        }
1135        buffer.append(image.substring(start, index));
1136        index += esc.length();
1137        if (index < image.length()) {
1138          if ( ( ! noCase && image.startsWith(esc, index))
1139              || ( noCase && image.substring(index, index + esc.length()).equalsIgnoreCase(esc))) {
1140            buffer.append(esc);
1141            index += esc.length();
1142          } else if ( ( ! noCase && image.startsWith(begin, index))
1143                     || ( noCase && image.substring(index, index + begin.length()).equalsIgnoreCase(begin))) {
1144            buffer.append(begin);
1145            index += begin.length();
1146          } else if ( ( ! noCase && image.startsWith(end, index))
1147                     || ( noCase && image.substring(index, index + end.length()).equalsIgnoreCase(end))) {
1148            buffer.append(end);
1149            index += end.length();
1150          }
1151        }
1152        start = index;
1153      }
1154      index++;
1155    }
1156    
1157    if (buffer != null && start < index) {
1158      buffer.append(image.substring(start, endIndex));
1159    }
1160  
1161    return splitIntoLines((buffer != null) ? buffer.toString() : image.substring(start, endIndex));
1162  }
1163  
1164  /**
1165   * Splits a given block comment into lines. The method is used to retrieve the
1166   * image parts for block comment token types.
1167   *
1168   * @param prop the {@link TokenizerProperty} describing a block comment
1169   * @param image split this string into lines
1170   * @return an array containing the lines of the image without line separator
1171   * characters
1172   */

1173  protected String JavaDoc[] splitBlockComment(TokenizerProperty prop, String JavaDoc image) {
1174    // complete string
1175
String JavaDoc[] images = prop.getImages();
1176    String JavaDoc start = images[0];
1177    String JavaDoc end = images[1];
1178    boolean noCase = isFlagSet(prop, Flags.F_NO_CASE);
1179
1180    if ( image.length() - start.length() >= end.length()
1181        && ( ( ! noCase && end.equals(image.substring(image.length() - end.length())))
1182            || ( noCase && end.equalsIgnoreCase(image.substring(image.length() - end.length()))))) {
1183      return splitIntoLines(image.substring(start.length(), image.length() - end.length()));
1184    } else {
1185      return splitIntoLines(image.substring(start.length()));
1186    }
1187  }
1188  
1189  /**
1190   * Event handler method. The given {@link TokenizerPropertyEvent} parameter
1191   * contains the nessecary information about the property change. We choose
1192   * one single method in favour of various more specialized methods since the
1193   * reactions on adding, removing and modifying tokenizer properties are often
1194   * the same (flushing cash, rereading information etc.) are probably not very
1195   * different.
1196   *<br>
1197   * Note that a modification of the parse flags in the backing {@link TokenizerProperties}
1198   * object removes all flags previously modified through {@link #changeParseFlags}.
1199   *
1200   * @param event the {@link TokenizerPropertyEvent} that describes the change
1201   */

1202  public void propertyChanged(TokenizerPropertyEvent event) {
1203    TokenizerProperty prop = event.getProperty();
1204    String JavaDoc[] images = prop.getImages();
1205    
1206    synchronized(this) {
1207      switch (event.getType()) {
1208      case TokenizerPropertyEvent.PROPERTY_ADDED:
1209      case TokenizerPropertyEvent.PROPERTY_REMOVED:
1210        switch (prop.getType()) {
1211        case Token.LINE_COMMENT:
1212        case Token.BLOCK_COMMENT:
1213        case Token.STRING:
1214        case Token.SPECIAL_SEQUENCE:
1215          if ( (_internalFlags & IFLAG_EXTERNAL_SEQUENCE_HANDLER) == 0
1216              && _properties instanceof de.susebox.jtopas.spi.SequenceHandler) {
1217            setSequenceHandler((de.susebox.jtopas.spi.SequenceHandler)_properties);
1218          }
1219          break;
1220        case Token.KEYWORD:
1221          if ( (_internalFlags & IFLAG_EXTERNAL_KEYWORD_HANDLER) == 0
1222              && _properties instanceof de.susebox.jtopas.spi.KeywordHandler) {
1223            setKeywordHandler((de.susebox.jtopas.spi.KeywordHandler)_properties);
1224          }
1225          break;
1226        case Token.PATTERN:
1227          if ( (_internalFlags & IFLAG_EXTERNAL_PATTERN_HANDLER) == 0
1228              && _properties instanceof de.susebox.jtopas.spi.PatternHandler) {
1229            setPatternHandler((de.susebox.jtopas.spi.PatternHandler)_properties);
1230          }
1231          break;
1232        }
1233        break;
1234        
1235      case TokenizerPropertyEvent.PROPERTY_MODIFIED:
1236        switch (prop.getType()) {
1237        case TokenizerProperty.PARSE_FLAG_MASK:
1238          _flags = getTokenizerProperties().getParseFlags();
1239          _flagMask = 0;
1240          if (isFlagSet(Flags.F_COUNT_LINES)) {
1241            if (_lineNumber < 0) {
1242              if (_position2LineMap != null) {
1243                _position2LineMap.clear();
1244              }
1245              _lineNumber = 0;
1246              putPosition(_currentReadPos, _lineNumber);
1247            }
1248            if (_columnNumber < 0) {
1249              _columnNumber = 0;
1250            }
1251          } else {
1252            _lineNumber = -1;
1253            _columnNumber = -1;
1254          }
1255          break;
1256        }
1257        break;
1258      }
1259    }
1260  }
1261  
1262
1263  //---------------------------------------------------------------------------
1264
// Implementation
1265
//
1266

1267  /**
1268   * Embedded tokenizers have their base tokenizer they share the input stream
1269   * with.
1270   *
1271   * @return the base tokenizer (the one owning the input stream and text buffer)
1272   */

  protected AbstractTokenizer getBaseTokenizer() {
    // _baseTokenizer refers to this tokenizer itself unless this instance
    // was registered as an embedded tokenizer via addTokenizer
    return _baseTokenizer;
  }
1276  
1277  /**
1278   * Returns the {@link de.susebox.jtopas.spi.DataProvider} of the base tokenizer.
1279   * This is this tokenizer if it is not an embedded one.
1280   *
1281   * @param startPos position in the input data
1282   * @param length number of characters
1283   * @return the <code>DataProvider</code> for the given data range
1284   */

1285  protected DataProvider getBaseDataProvider(int startPos, int length) {
1286    return getBaseTokenizer().getDataProvider(startPos, length);
1287  }
1288
1289  /**
1290   * This method organizes the input buffer. It moves the current text window if
1291   * nessecary or allocates more space, if data should be kept completely (see the
1292   * {@link TokenizerProperties#F_KEEP_DATA} flag).
1293   * Its main purpose is to call the {@link TokenizerSource#read} method.
1294   *
1295   * @return number of read bytes or -1 if an end-of-file condition occured
1296   * @throws TokenizerException wrapped exceptions from the {@link TokenizerSource#read}
1297   * method
1298   */

1299  protected int readMoreDataFromBase() throws TokenizerException {
1300    // its always the base tokenizer doing the reading
1301
int readChars = -1;
1302    
1303    if ( ! _eofReached) {
1304      AbstractTokenizer baseTokenizer = getBaseTokenizer();
1305
1306      if (baseTokenizer != this) {
1307        readChars = baseTokenizer.readMoreData();
1308      } else {
1309        readChars = readMoreData();
1310      }
1311      if (readChars > 0) {
1312        _currentWritePos += readChars;
1313      } else if (readChars < 0) {
1314        readChars = -1;
1315        _eofReached = true;
1316      }
1317
1318      // Inform all embedded tokenizers about input buffer changes
1319
synchronizeAll();
1320    }
1321    return readChars;
1322  }
1323  
1324  /**
1325   * When the method {@link #readMoreData} changes the contents of the input buffer
1326   * or the input buffer itself, all embedded tokenizers must be synchronized.
1327   * That means their member variables are adjusted to the base tokenizer.
1328   *
1329   * @throws TokenizerException if something goes wrong
1330   */

1331  protected void synchronizeAll() throws TokenizerException {
1332    AbstractTokenizer embedded = getBaseTokenizer();
1333
1334    while ((embedded = embedded._nextTokenizer) != null) {
1335      switchTo(embedded); // adjust the member variables
1336
}
1337  }
1338
1339  /**
1340   * Checks the EOF condition at the given offset.
1341   *
1342   * @param offset check at this position relative to the current read position
1343   * @return <code>true</code> if EOF has been reached, <code>false</code> otherwise
1344   * @throws TokenizerException failure while reading data from the input stream
1345   */

1346  protected boolean isEOF(int offset) throws TokenizerException {
1347    if (_currentReadPos + offset < _currentWritePos || readMoreDataFromBase() > 0) {
1348      return false;
1349    } else {
1350      _scannedToken[1] = new Token(Token.EOF);
1351      return true;
1352    }
1353  }
1354    
1355  /**
1356   * The number of characters until the next comment, whitespace, string, special
1357   * sequence or separator are determined. The character sequnce is then checked
1358   * for keyword or pattern matching.
1359   *
1360   * @param token buffer to receive information about the keyword or normal token
1361   * @return <code>null</code> or a {@link TokenizerProperty} if a keyword or pattern is found
1362   * @throws TokenizerException failure while reading data from the input stream
1363   */

1364  protected TokenizerProperty completeBoundedToken(Token token) throws TokenizerException {
1365    // find out the return value (length of normal token)
1366
int len = 1; // the first character is a normal one, see call of this method
1367

1368    while ( ! ( isEOF(len)
1369               || isWhitespace(len)
1370               || isPattern(len, true)
1371               || isSpecialSequence(len)
1372               || isSeparator(len))) {
1373      len++;
1374    }
1375    token.setLength(len);
1376    
1377    // test on keyword or non-free pattern
1378
TokenizerProperty prop = null;
1379    PatternHandler.Result result;
1380    
1381    if ((prop = isKeyword(_currentReadPos, len)) != null) {
1382      token.setType(Token.KEYWORD);
1383      token.setCompanion(prop.getCompanion());
1384    } else {
1385      token.setType(Token.NORMAL);
1386    }
1387    return prop;
1388  }
1389  
1390  /**
1391   * After having identified a whitespace, this method continues to read data
1392   * until it detects a non-whitespace.
1393   *
1394   * @return number of consecutive whitespaces
1395   * @throws TokenizerException failure while reading data from the input stream
1396   */

1397  protected int completeWhitespace() throws TokenizerException {
1398    int start = _currentReadPos + 1; // the first whitespace we have already
1399
int available = _currentWritePos - start;
1400    int len = readWhitespaces(start, available);
1401    
1402    while (len == available) {
1403      if (readMoreDataFromBase() <= 0) {
1404        break;
1405      }
1406      start += len;
1407      available = _currentWritePos - start;
1408      len += readWhitespaces(start, available);
1409    }
1410    return len + 1; // the first whitespace we had already
1411
}
1412  
1413  /**
1414   * This method checks at the given offset if it is a whitespace.
1415   *
1416   * @param offset check at this position relative to the current read position
1417   * @throws TokenizerException failure while reading data from the input stream
1418   * @return <code>true</code> if a whitespace sequence was found at the given offset,
1419   * <code>false</code> otherwise
1420   */

  protected boolean isWhitespace(int offset) throws TokenizerException {
    if (_whitespaceHandler != null) {
      // ensure the character at the requested offset is available.
      // NOTE(review): this proceeds to getChar when readMoreDataFromBase
      // returns 0, while isSeparator requires a value > 0 — confirm that
      // 0 cannot be returned when the offset is still out of range
      if (_currentReadPos + offset >= _currentWritePos && readMoreDataFromBase() < 0) {
        return false;
      }
    
      // record the whitespace token for the following nextToken call
      if (isWhitespace(getChar(_currentReadPos + offset))) {
        _scannedToken[1] = new Token(Token.WHITESPACE);
        return true;
      }
    }
    return false;
  }
1434      
1435  /**
1436   * This method checks at the given offset if it contains a separator.
1437   *
1438   * @param offset check at this position relative to the current read position
1439   * @throws TokenizerException failure while reading data from the input stream
 * @return <code>true</code> if a separator was found at the given offset,
1441   * <code>false</code> otherwise
1442   */

1443  protected boolean isSeparator(int offset) throws TokenizerException {
1444    if ( _separatorHandler != null
1445        && (_currentReadPos + offset < _currentWritePos || readMoreDataFromBase() > 0)
1446        && _separatorHandler.isSeparator(getChar(_currentReadPos + offset))) {
1447      _scannedToken[1] = new Token(Token.SEPARATOR);
1448      return true;
1449    } else {
1450      return false;
1451    }
1452  }
1453  
1454  /**
1455   * Testing for pattern matching.
1456   *
1457   * @param offset check at this position relative to the current read position
1458   * @param freePatternOnly if <code>true</code> consider only pattern that can occur anywhere in the data
1459   * @throws TokenizerException failure while reading data from the input stream
1460   * @return <code>true</code> if a pattern match was found at the given offset,
1461   * <code>false</code> otherwise
1462   */

  protected boolean isPattern(int offset, boolean freePatternOnly) throws TokenizerException {
    if (_patternHandler != null) {
      // pattern matching may need a lot of lookahead; fill the buffer up
      // to PATTERN_MAX_SIZE characters behind the starting position
      int startingAtPos = _currentReadPos + offset;
      
      while (_currentWritePos - startingAtPos < PATTERN_MAX_SIZE) {
        if (readMoreDataFromBase() <= 0) {
          break;
        }
      }

      // try pattern matching
      DataProvider dataProvider = getBaseDataProvider(startingAtPos, _currentWritePos - startingAtPos);
      PatternHandler.Result result = _patternHandler.matches(dataProvider);
      boolean isFree = (result != null) ? isFlagSet(result.getProperty(), Flags.F_FREE_PATTERN) : false;
      
      if (result != null && (isFree || ! freePatternOnly)) {
        if ( ! isFree) {
          // a non-free pattern only counts when terminated by EOF,
          // whitespace, another pattern, a special sequence or a separator
          int nextOffset = offset + result.getLengthOfMatch();
          
          if ( isEOF(nextOffset)
              || isWhitespace(nextOffset)
              || isPattern(nextOffset, true)
              || isSpecialSequence(nextOffset)
              || isSeparator(nextOffset)) {
            // the checks above stored the terminating token in
            // _scannedToken[1]; keep it for a later nextToken call
            _scannedToken[2] = _scannedToken[1];
          } else {
            return false;
          }
        }
        _scannedToken[1] = new Token(Token.PATTERN, null, result.getProperty());
        _scannedToken[1].setLength(result.getLengthOfMatch());
        if (isFlagSet(result.getProperty(), Flags.F_RETURN_IMAGE_PARTS)) {
          _scannedToken[1].setImageParts(result.getGroups());
        }
        return true;
      }
    }
    
    // no pattern matching available or no match found
    return false;
  }
1505  
1506  /**
1507   * This method checks at the given offset if it contains a a special sequence.
1508   * Unlike the method {@link #test4SpecialSequence} it does nothing more.
1509   *
1510   * @param offset check at this position relative to the current read position
1511   * @throws TokenizerException failure while reading data from the input stream
1512   * @return <code>true</code> if a special sequence was found at the given offset,
1513   * <code>false</code> otherwise
1514   */

1515  protected boolean isSpecialSequence(int offset) throws TokenizerException {
1516    if (_sequenceHandler != null) {
1517      // do we need more data to ensure enough characters for even the longest
1518
// possible sequence match
1519
int startingAtPos = _currentReadPos + offset;
1520    
1521      while (_sequenceHandler.getSequenceMaxLength() > _currentWritePos - startingAtPos) {
1522        if (readMoreDataFromBase() <= 0) {
1523          break;
1524        }
1525      }
1526      
1527      // invoke the sequence handler
1528
DataProvider dataProvider = getBaseDataProvider(startingAtPos, _currentWritePos - startingAtPos);
1529      TokenizerProperty prop = _sequenceHandler.startsWithSequenceCommentOrString(dataProvider);
1530    
1531      if (prop != null) {
1532        _scannedToken[1] = new Token(prop.getType(), null, prop);
1533        return true;
1534      }
1535    }
1536    
1537    // no sequence handler given or no special sequence at given offset
1538
return false;
1539  }
1540  
1541  /**
1542   * Completing a line comment. After a line comment sequence has been found, all
1543   * characters up to and including the end-of-line combination belong to the
1544   * line comment. Note that on reaching end-of-file a line comment does not
1545   * nessecarily ends with an end-of-line sequence (linefeed for example).
1546   *
1547   * @param prop the property describing the line comment to complete
1548   * @return length of the line comment
1549   * @throws TokenizerException failure while reading data from the input stream
1550   */

1551  protected int completeLineComment(TokenizerProperty prop) throws TokenizerException {
1552    String JavaDoc[] images = prop.getImages();
1553    int len = images[0].length();
1554
1555    while (_currentReadPos + len < _currentWritePos || readMoreDataFromBase() > 0) {
1556      switch (getChar(_currentReadPos + len)) {
1557      case '\r':
1558        len++;
1559        if (_currentReadPos + len < _currentWritePos || readMoreDataFromBase() > 0) {
1560          if (getChar(_currentReadPos + len) == '\n') {
1561            len++;
1562          }
1563        }
1564        return len;
1565      case '\n':
1566        len++;
1567        return len;
1568      default:
1569        len++;
1570      }
1571    }
1572    return len;
1573  }
1574  
1575  /**
1576   * Completing a block comment. After a block comment sequence has been found, all
1577   * characters up to and including the end sequence of the block comment belong
1578   * to the block comment. Note that on reaching end-of-file a block comment does
1579   * not nessecarily ends with an end-of-block-comment sequence.
1580   *
1581   * @param prop the property describing the block comment to complete
1582   * @return length of the block comment
1583   * @throws TokenizerException failure while reading data from the input stream
1584   */

  protected int completeBlockComment(TokenizerProperty prop) throws TokenizerException {
    // block comment delimiters and behavior flags
    String[] images = prop.getImages();
    String start = images[0];
    String end = images[1];
    boolean noCase = isFlagSet(prop, Flags.F_NO_CASE);
    boolean nested = isFlagSet(prop, Flags.F_ALLOW_NESTED_COMMENTS);
    int len = start.length();   // scan starts behind the opening sequence
    int level = 0;              // nesting level; drops below 0 at the final end sequence

    __LOOP__:
    do {
      // test on nested comments: we take only care for nesting the same
      // block comment
      if (nested) {
        switch (comparePrefix(len, start, noCase)) {
        case 0: // comment start identified
          level++;
          len += start.length();
          continue __LOOP__;
        case -1: // EOF reached: the comment stays open, return remaining data
          return _currentWritePos - _currentReadPos;
        }
      }
      
      // is it the end ?
      switch (comparePrefix(len, end, noCase)) {
      case 0: // comment end identified
        level--;
        len += end.length();
        break;
      case -1: // EOF reached
        return _currentWritePos - _currentReadPos;
      default: // an ordinary comment character
        len++;
      }
    } while (level >= 0);

    // block comment regularly terminated
    return len;
  }
1625  
1626  /**
1627   * Completing a string. After a string start sequence has been found, all
1628   * characters up to and including the end-of-string sequence belong to the
1629   * string. Note that on reaching end-of-file a string does not nessecarily ends
1630   * with an end-of-string sequence.
1631   *
1632   * @param prop the property describing the string to complete
1633   * @return length of the string
1634   * @throws TokenizerException failure while reading data from the input stream
1635   */

1636  protected int completeString(TokenizerProperty prop) throws TokenizerException {
1637    // complete string
1638
String JavaDoc[] images = prop.getImages();
1639    String JavaDoc start = images[0];
1640    String JavaDoc end = images[1];
1641    String JavaDoc esc = images[2];
1642    int len = start.length();
1643    boolean noCase = isFlagSet(prop, Flags.F_NO_CASE);
1644    boolean escEqualsEnd = ( ! noCase && esc.compareTo(end) == 0)
1645                              || ( noCase && esc.compareToIgnoreCase(end) == 0);
1646
1647    while (true) {
1648      // test on escape
1649
if (esc != null) {
1650        switch (comparePrefix(len, esc, noCase)) {
1651        case 0: // escape found
1652
len += esc.length();
1653          if (escEqualsEnd) {
1654            switch (comparePrefix(len, end, noCase)) {
1655            case 0:
1656              len += end.length();
1657              break;
1658            case -1: // EOF reached
1659
return _currentWritePos - _currentReadPos;
1660            default: // this is the regular return point if the esc is the string end
1661
return len;
1662            }
1663          } else {
1664            len++; // esc != string end: skip the next character
1665
}
1666          continue;
1667        case -1: // EOF reached
1668
return _currentWritePos - _currentReadPos;
1669        }
1670      }
1671
1672      // test on end sequence
1673
switch (comparePrefix(len, end, noCase)) {
1674      case 0: // this is the regular return point if esc != string end
1675
len += end.length();
1676        return len;
1677      case -1: // EOF reached
1678
return _currentWritePos - _currentReadPos;
1679      default:
1680        len++;
1681      }
1682    }
1683  }
1684
1685  /**
1686   * This method compares the characters at the given offset (from the current
1687   * read position) with the given prefix.
1688   *
1689   * @param offset start comparing at this offset from the current read position
 * @param prefix compare read data with this prefix
1691   * @param noCase case- or not case-sensitive comparison
1692   * @throws TokenizerException failure while reading data from the input stream
1693   * @return 0 if the the given prefix matches the input stream, -1 on EOF and
1694   * 1 if not matching
1695   */

1696  protected int comparePrefix(int offset, String JavaDoc prefix, boolean noCase)
1697    throws TokenizerException
1698  {
1699    // compare
1700
int len = prefix.length();
1701    
1702    for (int pos = offset; pos < offset + len; ++pos) {
1703      // do we have enough data
1704
if (_currentReadPos + pos >= _currentWritePos && readMoreDataFromBase() < 0) {
1705        return -1;
1706      }
1707      
1708      // compare single character
1709
char c1 = prefix.charAt(pos - offset);
1710      char c2 = getChar(_currentReadPos + pos);
1711      
1712      if ( c1 != c2
1713          && (! noCase || Character.toUpperCase(c1) != Character.toUpperCase(c2))) {
1714        return 1;
1715      }
1716    }
1717    
1718    // found
1719
return 0;
1720  }
1721  
1722  /**
1723   * The method recomputes the line and column position of the tokenizer, if the
1724   * flag {@link TokenizerProperties#F_COUNT_LINES} is set. It gets the token type of the
1725   * {@link Token} that has been retrieved by the calling {@link #nextToken}.
1726   * Using the tokenizer control flags and certain other information it tries to
1727   * to find end-of-line sequences as fast as possible. For example, a line
1728   * comment should always contain a end-of-line sequence, so we can simply
1729   * increase the line count and set the column count to 0.
1730   *
1731   * @param type the type of the current token
1732   * @param length the length of the current token
1733   */

1734  protected void adjustLineAndColumn(int type, int length) {
1735    // line and column counting not required
1736
if ( ! isFlagSet(Flags.F_COUNT_LINES)) {
1737      return;
1738    }
1739    
1740    // there might be a simple way to determine the current line and column position
1741
switch (type) {
1742    case Token.EOF:
1743      return;
1744        
1745    case Token.LINE_COMMENT: // a line comment always ends with a newline
1746
_lineNumber++;
1747      _columnNumber = 0;
1748      putPosition(_currentReadPos + length, _lineNumber);
1749      return;
1750      
1751    case Token.SPECIAL_SEQUENCE:
1752    case Token.SEPARATOR:
1753    case Token.NORMAL:
1754    case Token.KEYWORD:
1755      if (_whitespaceHandler != null && _whitespaceHandler.newlineIsWhitespace()) { // newline is a whitespace character
1756
_columnNumber += length; // it should therefore not occure in other
1757
return; // tokens
1758
}
1759      break;
1760        
1761    case Token.WHITESPACE:
1762      if ( ! (_whitespaceHandler.isWhitespace('\n') || _whitespaceHandler.isWhitespace('\r'))) {
1763        _columnNumber += length; // newline is not a whitespace; we do not have
1764
return; // to test for it in the current token
1765
}
1766      break;
1767    }
1768    
1769    // count it
1770
int newLineNumber = _lineNumber;
1771    
1772    for (int pos = _currentReadPos; pos < _currentReadPos + length; ++pos) {
1773      switch (getChar(pos)) {
1774      case '\r':
1775        if (pos + 1 >= _currentReadPos + length || getChar(pos + 1) != '\n') {
1776          _lineNumber++;
1777          _columnNumber = 0;
1778          putPosition(pos + 1, _lineNumber);
1779          break;
1780        }
1781        pos++;
1782        /* no break; */
1783      case '\n':
1784        _lineNumber++;
1785        _columnNumber = 0;
1786        putPosition(pos + 1, _lineNumber);
1787        break;
1788        
1789      default:
1790        _columnNumber++;
1791      }
1792    }
1793  }
1794  
1795  /**
1796   * Putting a new position into the position-to-line-number map.
1797   *
1798   * @param position the position to map to the current line number
1799   */

1800  private void putPosition(int position, int lineNumber) {
1801    if (_position2LineMap == null) {
1802      _position2LineMap = new TreeMap JavaDoc();
1803    }
1804    _position2LineMap.put(new Integer JavaDoc(position), new Integer JavaDoc(lineNumber));
1805  }
1806  
1807  /**
1808   * Checking a given flag. The method considers both the globally set flags
1809   * in the associated {@link TokenizerProperties} instance and the locally set
1810   * by {@link #changeParseFlags}.
1811   *
1812   * @param flag one of the <code>F_...</code> flags defined in {@link TokenizerProperties}
1813   */

1814  protected boolean isFlagSet(int flag) {
1815    return (getParseFlags() & flag) != 0;
1816  }
1817  
1818  /**
1819   * Checking if a given flag is set for the given {@link TokenizerProperty}, for
1820   * this <code>Tokenizer</code> or for the used {@link TokenizerProperties}. The method considers both the globally set flags
1821   * in the associated {@link TokenizerProperties} instance and the locally set
1822   * by {@link #changeParseFlags}.
1823   *
1824   * @param prop check the flag for this property
1825   * @param flag one of the {@link Flags} constants
1826   */

1827  protected boolean isFlagSet(TokenizerProperty prop, int flag) {
1828    return prop.isFlagSet(flag, (getTokenizerProperties().getParseFlags() & flag) != 0 || isFlagSet(flag));
1829  }
1830  
1831  
  //---------------------------------------------------------------------------
  // Class members
  //

  /**
   * mask of flags that can be set separately for a <code>AbstractTokenizer</code>.
   */
  protected static final int VALID_FLAGS_MASK =
      Flags.F_RETURN_WHITESPACES
    | Flags.F_TOKEN_POS_ONLY
    | Flags.F_KEEP_DATA
    | Flags.F_COUNT_LINES;

  /**
   * {@link TokenizerProperties} that are used if no others have been
   * specified by calling {@link #setTokenizerProperties}.
   */
  protected StandardTokenizerProperties _defaultProperties = null;

  /**
   * Buffer sizes
   */
  private static final int PATTERN_MAX_SIZE = 0x40000; // 256K

  /**
   * Bits for the internal flag bitmask (see {@link #_internalFlags})
   */
  private static final byte IFLAG_EXTERNAL_PATTERN_HANDLER = 0x01;
  private static final byte IFLAG_EXTERNAL_KEYWORD_HANDLER = 0x02;
  private static final byte IFLAG_EXTERNAL_SEQUENCE_HANDLER = 0x04;

  //---------------------------------------------------------------------------
  // Members
  //

  /**
   * overall tokenizer flags.
   */
  protected int _flags = 0;

  /**
   * a combination of <code>F_...</code> constants defined in {@link TokenizerProperties}
   * indicating which bits in the {@link #_flags} member are valid. All other
   * flags are taken from the associated {@link TokenizerProperties} object.
   *
   * @see #changeParseFlags
   */
  private int _flagMask = 0;

  /**
   * Flag if EOF has been reached. The flag should speed up calls to {@link #readMoreDataFromBase}
   */
  private boolean _eofReached = true;

  /**
   * Data index where {@link #nextToken} will start parsing.
   */
  protected int _currentReadPos = 0;

  /**
   * Data index where {@link #readMoreDataFromBase} will fill in new data.
   */
  protected int _currentWritePos = 0;

  /**
   * if line counting is enabled, this contains the current line number starting
   * with 0.
   */
  protected int _lineNumber = -1;

  /**
   * if line counting is enabled, this contains the current column number starting
   * with 0.
   */
  protected int _columnNumber = -1;

  /**
   * List of currently known token. The first element is the current token returned
   * by the last call to {@link #nextToken}. The following elements are look-ahead
   * token that have already been identified when extracting the current token.
   */
  protected Token[] _scannedToken = new Token[] { null, null, null };

  /**
   * For embedded tokenizers: this is the list of the succeeding tokenizers
   */
  protected AbstractTokenizer _nextTokenizer = null;

  /**
   * For embedded tokenizers: this is the base tokenizer that reads the data
   */
  protected AbstractTokenizer _baseTokenizer = null;

  /**
   * For embedded tokenizers: this is the list of the previous tokenizers
   */
  protected AbstractTokenizer _prevTokenizer = null;

  /**
   * Whitespace handler
   */
  private de.susebox.jtopas.spi.WhitespaceHandler _whitespaceHandler = null;

  /**
   * Separator handler
   */
  private de.susebox.jtopas.spi.SeparatorHandler _separatorHandler = null;

  /**
   * Keyword handler
   */
  private de.susebox.jtopas.spi.KeywordHandler _keywordHandler = null;

  /**
   * Sequence handler
   */
  private de.susebox.jtopas.spi.SequenceHandler _sequenceHandler = null;

  /**
   * Pattern handler
   */
  private de.susebox.jtopas.spi.PatternHandler _patternHandler = null;

  /**
   * The source of input data
   */
  private TokenizerSource _source = null;

  /**
   * The characteristics of this tokenizer.
   */
  private TokenizerProperties _properties = null;

  /**
   * Position-to-line-number mapping, filled by {@link #putPosition} when line
   * counting is enabled (keys are text positions, values are line numbers).
   */
  private TreeMap _position2LineMap = null;

  /**
   * Control flags for the internal work
   */
  private long _internalFlags = 0;
1975}
1976
Popular Tags