KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > de > susebox > jtopas > AbstractTokenizer


1 /*
2  * AbstractTokenizer.java: base class for Tokenizer implementations.
3  *
4  * Copyright (C) 2004 Heiko Blau
5  *
6  * This file belongs to the JTopas Library.
7  * JTopas is free software; you can redistribute it and/or modify it
8  * under the terms of the GNU Lesser General Public License as published by the
9  * Free Software Foundation; either version 2.1 of the License, or (at your
10  * option) any later version.
11  *
12  * This software is distributed in the hope that it will be useful, but WITHOUT
13  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14  * FITNESS FOR A PARTICULAR PURPOSE.
15  * See the GNU Lesser General Public License for more details.
16  *
17  * You should have received a copy of the GNU Lesser General Public License along
18  * with JTopas. If not, write to the
19  *
20  * Free Software Foundation, Inc.
21  * 59 Temple Place, Suite 330,
22  * Boston, MA 02111-1307
23  * USA
24  *
25  * or check the Internet: http://www.fsf.org
26  *
27  * Contact:
28  * email: heiko@susebox.de
29  */

30
31 package de.susebox.jtopas;
32
33 //-----------------------------------------------------------------------------
34
// Imports
35
//
36
import java.io.Reader;

import java.util.Arrays;
import java.util.LinkedList;
import java.util.SortedMap;
import java.util.TreeMap;

import de.susebox.java.lang.ExtIndexOutOfBoundsException;

import de.susebox.jtopas.spi.DataMapper;
import de.susebox.jtopas.spi.DataProvider;
import de.susebox.jtopas.spi.KeywordHandler;
import de.susebox.jtopas.spi.PatternHandler;
import de.susebox.jtopas.spi.SeparatorHandler;
import de.susebox.jtopas.spi.SequenceHandler;
import de.susebox.jtopas.spi.StandardKeywordHandler;
import de.susebox.jtopas.spi.StandardSeparatorHandler;
import de.susebox.jtopas.spi.StandardSequenceHandler;
import de.susebox.jtopas.spi.StandardWhitespaceHandler;
import de.susebox.jtopas.spi.WhitespaceHandler;
58
59 //-----------------------------------------------------------------------------
60
// Class AbstractTokenizer
61
//
62

/**<p>
 * Base class for {@link Tokenizer} implementations. <code>AbstractTokenizer</code>
 * separates the data analysis from the actual data provision. Although the class
 * maintains read and write positions, the physical representation of the logical
 * character buffer behind these positions concerns only the subclasses.
 *</p>
 *
 * @see Tokenizer
 * @see TokenizerProperties
 * @author Heiko Blau
 */

74 public abstract class AbstractTokenizer implements Tokenizer, TokenizerPropertyListener {
75
76   //---------------------------------------------------------------------------
77
// Abstract methods
78
//
79

80   /**
81    * Subclasses have to provide {@link de.susebox.jtopas.spi.DataProvider}
82    * instances for various token type handlers. The given start position is the
83    * absolute number of characters from the beginning of the data source.
84    *
85    * @param startPos position in the input data
86    * @param length number of characters
87    * @return the <code>DataProvider</code> for the given data range
88    */

89   protected abstract DataProvider getDataProvider(int startPos, int length);
90
91   /**
92    * This method is called when the tokenizer runs out of data. Its main purpose
93    * is to call the {@link TokenizerSource#read} method. It is also responsible
94    * to handle the flag {@link TokenizerProperties#F_KEEP_DATA} flag).
95    *
96    * @return number of read bytes or -1 if an end-of-file condition occured
97    * @throws TokenizerException wrapped exceptions from the {@link TokenizerSource#read}
98    * method
99    */

100   protected abstract int readMoreData() throws TokenizerException;
101   
102   
103   //---------------------------------------------------------------------------
104
// Constructors
105
//
106

107   /**
108    * Default constructor that sets the tokenizer control flags as it would be
109    * approbriate for C/C++ and Java. Found token images are copied. No line nor
110    * column informations are provided. Nested comments are not allowed.
111    *<br>
112    * The tokenizer will use the {@link TokenizerProperties#DEFAULT_WHITESPACES}
113    * and {@link TokenizerProperties#DEFAULT_SEPARATORS} for whitespace and
114    * separator handling.
115    */

116   public AbstractTokenizer() {
117     _baseTokenizer = this;
118     if (_defaultProperties == null) {
119       _defaultProperties = new StandardTokenizerProperties();
120     }
121     setTokenizerProperties(_defaultProperties);
122   }
123   
124   /**
125    * Contructing a <code>AbstractTokenizer</code> with a backing {@link TokenizerProperties}
126    * instance.
127    *
128    * @param properties an {@link TokenizerProperties} object containing the
129    * settings for the tokenizing process
130    */

131   public AbstractTokenizer(TokenizerProperties properties) {
132     _baseTokenizer = this;
133     setTokenizerProperties(properties);
134   }
135
136   
137   //---------------------------------------------------------------------------
138
// data source
139
//
140

141   /**
142    * Setting the source of data. This method is usually called during setup of
143    * the <code>Tokenizer</code> but may also be invoked while the tokenizing
144    * is in progress. It will reset the tokenizers input buffer, line and column
145    * counters etc.
146    *<br>
147    * Subclasses should override this method to do their own actions on a data source
148    * change. Generally, this base method should be called first in the subclass
149    * implementation of <code>setSource</code> (equivalent to super calls in
150    * constructors of derived classes).
151    *
152    * @param source a {@link TokenizerSource} to read data from
153    * @see #getSource
154    */

155   public void setSource(TokenizerSource source) {
156     _source = source;
157     _eofReached = false;
158     _currentReadPos = 0;
159     _currentWritePos = 0;
160     if (isFlagSet(Flags.F_COUNT_LINES)) {
161       _lineNumber = 0;
162       _columnNumber = 0;
163     } else {
164       _lineNumber = -1;
165       _columnNumber = -1;
166     }
167     Arrays.fill(_scannedToken, null);
168   }
169   
170   /**
171    * Convenience method to avoid the construction of a {@link TokenizerSource}
172    * from the most important data source {@link java.io.Reader}.
173    *
174    * @param reader the {@link java.io.Reader} to get data from
175    */

176   public void setSource(Reader JavaDoc reader) {
177     setSource(new ReaderSource(reader));
178   }
179    
180   /**
181    * Retrieving the {@link TokenizerSource} of this <code>Tokenizer</code>. The
182    * method may return <code>null</code> if there is no <code>TokenizerSource</code>
183    * associated with it.
184    *
185    * @param the {@link TokenizerSource} associated with this <code>Tokenizer</code>
186    * @see #setSource
187    */

188   public TokenizerSource getSource() {
189     return _source;
190   }
191   
192   
193   //---------------------------------------------------------------------------
194
// Methods of the Tokenizer interface
195
//
196

197   /**
198    * Setting the tokenizer characteristics. See the method description in
199    * {@link Tokenizer}.
200    *
201    * @param props the {@link TokenizerProperties} for this tokenizer
202    * @throws NullPointerException if the <code>null</code> is passed to the call
203    * @see #getTokenizerProperties
204    */

205   public void setTokenizerProperties(TokenizerProperties props) throws NullPointerException JavaDoc {
206     if (props == null) {
207       throw new NullPointerException JavaDoc();
208     }
209
210     // set properties
211
if (_properties != null) {
212       _properties.removeTokenizerPropertyListener(this);
213     }
214     _properties = props;
215     _properties.addTokenizerPropertyListener(this);
216
217     // who is going to handle the various token types ?
218
if (_properties instanceof WhitespaceHandler) {
219       setWhitespaceHandler((WhitespaceHandler)_properties);
220     } else {
221       setWhitespaceHandler(new StandardWhitespaceHandler(_properties));
222     }
223     if (_properties instanceof SeparatorHandler) {
224       setSeparatorHandler((SeparatorHandler)_properties);
225     } else {
226       setSeparatorHandler(new StandardSeparatorHandler(_properties));
227     }
228     if (_properties instanceof SequenceHandler) {
229       setSequenceHandler((SequenceHandler)_properties);
230     } else {
231       setSequenceHandler(new StandardSequenceHandler(_properties));
232     }
233     if (props instanceof KeywordHandler) {
234       setKeywordHandler((KeywordHandler)props);
235     } else {
236       setKeywordHandler(new StandardKeywordHandler(_properties));
237     }
238     if (_properties instanceof PatternHandler) {
239       setPatternHandler((PatternHandler)_properties);
240     } else {
241       setPatternHandler(null);
242     }
243       
244     // flag handling
245
int newFlags = _properties.getParseFlags();
246
247     if (newFlags != _flags) {
248       propertyChanged(new TokenizerPropertyEvent(
249                             TokenizerPropertyEvent.PROPERTY_MODIFIED,
250                             new TokenizerProperty(TokenizerProperty.PARSE_FLAG_MASK,
251                                                   new String JavaDoc[] { Integer.toBinaryString(newFlags) } ),
252                             new TokenizerProperty(TokenizerProperty.PARSE_FLAG_MASK,
253                                                   new String JavaDoc[] { Integer.toBinaryString(_flags) } )));
254     }
255   }
256
257   /**
258    * Retrieving the current tokenizer characteristics. See the method description
259      * in {@link Tokenizer}.
260    *
261    * @return the {@link TokenizerProperties} of this <code>Tokenizer</code>
262    * @see #setTokenizerProperties
263    */

264   public TokenizerProperties getTokenizerProperties() {
265     return _properties;
266   }
267   
268   /**
269    * Setting the control flags of the <code>Tokenizer</code>. See the method
270    * description in {@link Tokenizer}.
271    *
272    * @param flags the parser control flags
273    * @param mask the mask for the flags to set or unset
274    * @throws TokenizerException if one or more of the flags given cannot be honored
275    * @see #getParseFlags
276    */

277   public void changeParseFlags(int flags, int mask) throws TokenizerException {
278     // test the given flags
279
if ((mask | VALID_FLAGS_MASK) != VALID_FLAGS_MASK) {
280       throw new TokenizerException(
281                   "One or more flags cannot be set separately for a {0}. Violating flags in {1}: {2}.",
282                   new Object JavaDoc[] { AbstractTokenizer.class.getName(),
283                                  Integer.toHexString(flags),
284                                  Integer.toHexString(mask & ~VALID_FLAGS_MASK) } );
285     }
286     
287     // set the new flags for this tokenizer
288
_flagMask = mask;
289     _flags = (flags & mask) | (getTokenizerProperties().getParseFlags() & ~mask);
290
291     // when counting lines initialize the current line and column position
292
if ( ! isFlagSet(Flags.F_COUNT_LINES)) {
293       _lineNumber = 0;
294       _columnNumber = 0;
295     }
296   }
297
298   /**
299    * Retrieving the parser control flags. See the method description in
300    * {@link Tokenizer}.
301    *
302    * @return the current parser control flags
303    * @see #changeParseFlags
304    */

305   public int getParseFlags() {
306     return (getTokenizerProperties().getParseFlags() & ~_flagMask) + (_flags & _flagMask);
307   }
308   
309   /**
310    * Setting a new {@link de.susebox.jtopas.spi.KeywordHandler} or removing any
311    * previously installed one. See the method description in {@link Tokenizer}.
312    *
313    * @param handler the (new) {@link KeywordHandler} to use or <code>null</code>
314    * to remove it
315    */

316   public void setKeywordHandler(de.susebox.jtopas.spi.KeywordHandler handler) {
317     synchronized(this) {
318       if (handler == _properties) {
319         if (_properties != null && _properties.getKeywords().hasNext()) {
320           _keywordHandler = handler;
321         } else {
322           _keywordHandler = null;
323         }
324         _internalFlags &= ~IFLAG_EXTERNAL_KEYWORD_HANDLER;
325       } else {
326         _keywordHandler = handler;
327         _internalFlags |= IFLAG_EXTERNAL_KEYWORD_HANDLER;
328       }
329     }
330   }
331   
332   /**
333    * Retrieving the current {@link de.susebox.jtopas.spi.KeywordHandler}. See the
334    * method description in {@link Tokenizer}.
335    *
336    * @return the currently active whitespace keyword or <code>null</code>, if
337    * keyword support is switched off
338    */

339   public de.susebox.jtopas.spi.KeywordHandler getKeywordHandler() {
340     synchronized(this) {
341       if ((_internalFlags & IFLAG_EXTERNAL_KEYWORD_HANDLER) == 0) {
342         return (de.susebox.jtopas.spi.KeywordHandler)getTokenizerProperties();
343       } else {
344         return _keywordHandler;
345       }
346     }
347   }
348   
349   /**
350    * Setting a new {@link de.susebox.jtopas.spi.WhitespaceHandler} or removing any
351    * previously installed one. See the method description in {@link Tokenizer}.
352    *
353    * @param handler the (new) whitespace handler to use or <code>null</code> to
354    * switch off whitespace handling
355    * @see #getWhitespaceHandler
356    */

357   public void setWhitespaceHandler(de.susebox.jtopas.spi.WhitespaceHandler handler) {
358     _whitespaceHandler = handler;
359   }
360   
361   /**
362    * Retrieving the current {@link de.susebox.jtopas.spi.WhitespaceHandler}. See
363    * the method description in {@link Tokenizer}.
364    *
365    * @return the currently active whitespace handler or null, if the base
366    * implementation is working
367    */

368   public de.susebox.jtopas.spi.WhitespaceHandler getWhitespaceHandler() {
369     return _whitespaceHandler;
370   }
371   
372   /**
373    * Setting a new {@link de.susebox.jtopas.spi.SeparatorHandler} or removing any
374    * previously installed <code>SeparatorHandler</code>. See the method description
375    * in {@link Tokenizer}.
376    *
377    * @param handler the (new) separator handler to use or <code>null</code> to
378    * remove it
379    * @see #getSeparatorHandler
380    */

381   public void setSeparatorHandler(de.susebox.jtopas.spi.SeparatorHandler handler) {
382     _separatorHandler = handler;
383   }
384   
385   /**
386    * Retrieving the current {@link de.susebox.jtopas.spi.SeparatorHandler}. See
387    * the method description in {@link Tokenizer}.
388    *
389    * @return the currently active {@link SeparatorHandler} or <code>null</code>,
390    * if separators aren't recognized by the tokenizer
391    * @see #setSequenceHandler
392    */

393   public de.susebox.jtopas.spi.SeparatorHandler getSeparatorHandler() {
394     return _separatorHandler;
395   }
396   
397   /**
398    * Setting a new {@link de.susebox.jtopas.spi.SequenceHandler} or removing any
399    * previously installed one. See the method description in {@link Tokenizer}.
400    *
401    * @param handler the (new) {@link SequenceHandler} to use or null to remove it
402    */

403   public void setSequenceHandler(de.susebox.jtopas.spi.SequenceHandler handler) {
404     synchronized(this) {
405       if (handler == _properties) {
406         if (_properties != null && ( _properties.getSpecialSequences().hasNext()
407                                     || _properties.getStrings().hasNext()
408                                     || _properties.getBlockComments().hasNext()
409                                     || _properties.getLineComments().hasNext())) {
410           _sequenceHandler = handler;
411         } else {
412           _sequenceHandler = null;
413         }
414         _internalFlags &= ~IFLAG_EXTERNAL_SEQUENCE_HANDLER;
415       } else {
416         _sequenceHandler = handler;
417         _internalFlags |= IFLAG_EXTERNAL_SEQUENCE_HANDLER;
418       }
419     }
420   }
421   
422   /**
423    * Retrieving the current {@link SequenceHandler}. See the method description
424    * in {@link Tokenizer}.
425    *
426    * @return the currently active {@link SequenceHandler} or null, if the base
427    * implementation is working
428    */

429   public de.susebox.jtopas.spi.SequenceHandler getSequenceHandler() {
430     synchronized(this) {
431       if ((_internalFlags & IFLAG_EXTERNAL_SEQUENCE_HANDLER) == 0) {
432         return (de.susebox.jtopas.spi.SequenceHandler)getTokenizerProperties();
433       } else {
434         return _sequenceHandler;
435       }
436     }
437   }
438   
439   /**
440    * Setting a new {@link de.susebox.jtopas.spi.PatternHandler} or removing any
441    * previously installed one. See the method description in {@link Tokenizer}.
442    *
443    * @param handler the (new) {@link de.susebox.jtopas.spi.PatternHandler} to
444    * use or <code>null</code> to remove it
445    * @see #getPatternHandler
446    */

447   public void setPatternHandler(de.susebox.jtopas.spi.PatternHandler handler) {
448     synchronized(this) {
449       if (handler == _properties) {
450         if (_properties != null && _properties.getPatterns().hasNext()) {
451           _patternHandler = handler;
452         } else {
453           _patternHandler = null;
454         }
455         _internalFlags &= ~IFLAG_EXTERNAL_PATTERN_HANDLER;
456       } else {
457         _patternHandler = handler;
458         _internalFlags |= IFLAG_EXTERNAL_PATTERN_HANDLER;
459       }
460     }
461   }
462   
463   /**
464    * Retrieving the current {@link de.susebox.jtopas.spi.PatternHandler}. See the
465    * method description in {@link Tokenizer}.
466    *
467    * @return the currently active {@link de.susebox.jtopas.spi.PatternHandler}
468    * or <code>null</code>, if patterns are not recognized by the tokenizer
469    * @see #setPatternHandler
470    */

471   public de.susebox.jtopas.spi.PatternHandler getPatternHandler() {
472     synchronized(this) {
473       if ((_internalFlags & IFLAG_EXTERNAL_PATTERN_HANDLER) == 0) {
474         return (de.susebox.jtopas.spi.PatternHandler)getTokenizerProperties();
475       } else {
476         return _patternHandler;
477       }
478     }
479   }
480   
481   /**
482    * Query the current row. The method can only be used if the flag {@link TokenizerProperties#F_COUNT_LINES}
483    * has been set. Without this flag being set, the return value is undefined.
484    *<br>
485    * Note that row counting starts with 0, while editors often use 1 for the first
486    * row.
487    *
488    * @return current row (starting with 0)
489    * or -1 if the flag {@link TokenizerProperties#F_COUNT_LINES} is set
490    */

491   public int getCurrentLine() {
492     return _lineNumber;
493   }
494   
495   /**
496    * Retrieve the current column. The method can only be used if the flag <code>F_COUNT_LINES</code>
497    * has been set.
498    * Without this flag being set, the return value is undefined.
499    * Note that column counting starts with 0, while editors often use 1 for the first
500    * column in one row.
501    *
502    * @return current column number (starting with 0)
503    */

504   public int getCurrentColumn() {
505     return _columnNumber;
506   }
507   
508   /**
509    * Checking if there are more tokens available. See the method description in
510    * {@link Tokenizer}.
511    *
512    * @return <code>true</code> if a ca_ll to {@link #nextToken} or {@link #nextImage}
513    * will succed, <code>false</code> otherwise
514    */

515   public boolean hasMoreToken() {
516     return _scannedToken[0] == null || _scannedToken[0].getType() != Token.EOF;
517   }
518   
519   /**
520    * Retrieving the next {@link Token}. See the method description in
521    * {@link Tokenizer}.
522    *
523    * @return found {@link Token} including the EOF token
524    * @throws TokenizerException generic exception (list) for all problems that may occur while parsing
525    * (IOExceptions for instance)
526    */

527   public Token nextToken() throws TokenizerException {
528     boolean returnIt = false;
529     
530     // Get the next token
531
__MAIN_LOOP__:
532     do {
533       // analyze look-ahead token
534
if (_scannedToken[1] == null) {
535         if ( ! isEOF(0)) {
536           if ( ! isWhitespace(0)) {
537             if ( ! isPattern(0, false)) {
538               if ( ! isSpecialSequence(0)) {
539                 if ( ! isSeparator(0)) {
540                   _scannedToken[1] = new Token(Token.NORMAL);
541                 }
542               }
543             }
544           }
545         }
546       }
547       _scannedToken[0] = _scannedToken[1];
548       _scannedToken[1] = _scannedToken[2];
549       _scannedToken[2] = null;
550       
551       // get new token or complete the previously found look-ahead token
552
Token token = _scannedToken[0];
553       TokenizerProperty prop = (TokenizerProperty)token.getCompanion();
554       
555       token.setCompanion((prop != null) ? prop.getCompanion() : null);
556       token.setStartPosition(getReadPosition());
557       token.setStartLine(_lineNumber);
558       token.setStartColumn(_columnNumber);
559
560       returnIt = true;
561       
562       switch (token.getType()) {
563       case Token.EOF:
564         token.setLength(0);
565         break;
566       case Token.WHITESPACE:
567         token.setLength(completeWhitespace());
568         returnIt = isFlagSet(Flags.F_RETURN_SIMPLE_WHITESPACES);
569         break;
570       case Token.SEPARATOR: // Separators are always single characters.
571
token.setLength(1);
572         break;
573       case Token.STRING:
574         token.setLength(completeString(prop));
575         break;
576       case Token.LINE_COMMENT:
577         token.setLength(completeLineComment(prop));
578         returnIt = isFlagSet(prop, Flags.F_RETURN_LINE_COMMENTS);
579         break;
580       case Token.BLOCK_COMMENT:
581         token.setLength(completeBlockComment(prop));
582         returnIt = isFlagSet(prop, Flags.F_RETURN_BLOCK_COMMENTS);
583         break;
584       case Token.SPECIAL_SEQUENCE:
585         token.setLength(prop.getImages()[0].length());
586         break;
587       case Token.PATTERN:
588         // already contained in the first look-ahead token, see token shifting
589
break;
590       default:
591         prop = completeBoundedToken(token);
592       }
593       
594       // compute new line and column positions (if flag is set) and complete
595
// the token
596
adjustLineAndColumn(token.getType(), token.getLength());
597       token.setEndLine(_lineNumber);
598       token.setEndColumn(_columnNumber);
599
600       // need to extract the image ?
601
if (returnIt) {
602         boolean tokenPosOnly = (prop != null) ? isFlagSet(prop, Flags.F_TOKEN_POS_ONLY) :
603                                                     isFlagSet(Flags.F_TOKEN_POS_ONLY);
604         boolean returnImageParts = (prop != null) ? isFlagSet(prop, Flags.F_RETURN_IMAGE_PARTS) :
605                                                     isFlagSet(Flags.F_RETURN_IMAGE_PARTS);
606         if ( ! tokenPosOnly || returnImageParts) {
607           token.setImage(getText(_currentReadPos, token.getLength()));
608         }
609         if (returnImageParts) {
610           switch (token.getType()) {
611           case Token.WHITESPACE:
612             token.setImageParts(splitIntoLines(token.getImage()));
613             break;
614           case Token.STRING:
615             token.setImageParts(splitString(prop, token.getImage()));
616             break;
617           case Token.LINE_COMMENT:
618             token.setImageParts(splitIntoLines(token.getImage().substring(prop.getImages()[0].length())));
619             break;
620           case Token.BLOCK_COMMENT:
621             token.setImageParts(splitBlockComment(prop, token.getImage()));
622             break;
623           case Token.PATTERN:
624             break;
625           case Token.EOF:
626             token.setImageParts(new String JavaDoc[] {} );
627             break;
628           default:
629             token.setImageParts(new String JavaDoc[] { token.getImage() } );
630           }
631         }
632       }
633
634       // this is the one and only point where the current read position is
635
// adjusted (except for the data shifting in readMoreData).
636
_currentReadPos += token.getLength();
637     
638     } while ( ! returnIt);
639
640     // the current token is the first in the list
641
return _scannedToken[0];
642   }
643   
644   /**
645    * This method is a convenience method. It returns only the next token image
646    * without any informations about its type or associated information. See the
647    * method description in {@link Tokenizer}.
648    *
649    * @return the token image of the next token
650    * @throws TokenizerException generic exception (list) for all problems that may occur while parsing
651    * (IOExceptions for instance)
652    * @see #currentImage
653    */

654   public String JavaDoc nextImage() throws TokenizerException {
655     nextToken();
656     return currentImage();
657   }
658   
659   /**
660    * Retrieve the {@link Token} that was found by the last call to {@link #nextToken}.
661    * See the method description in {@link Tokenizer}.
662    *
663    * @return the {@link Token} retrieved by the lahasest call to {@link #nextToken}.
664    * @throws TokenizerException if the tokenizer has no current token
665    */

666   public Token currentToken() throws TokenizerException {
667     if (_scannedToken[0] == null) {
668       throw new TokenizerException("No current token available (nextToken was not called / read position changed)");
669     }
670     return _scannedToken[0];
671   }
672  
673   /**
674    * Convenience method to retrieve only the token image of the {@link Token} that
675    * would be returned by {@link #currentToken}. See the method description in
676    * {@link Tokenizer}.
677    *
678    * @return the token image of the current token
679    * @see #currentToken
680    */

681   public String JavaDoc currentImage() throws TokenizerException {
682     Token token = currentToken();
683     
684     if (token.getType() == Token.EOF) {
685       return null;
686     } else if ( ! isFlagSet(Flags.F_TOKEN_POS_ONLY) || token.getImage() != null) {
687       return token.getImage();
688     } else {
689       return getText(token.getStartPosition(), token.getLength());
690     }
691   }
692   
693   /**
694    * If the flag {@link TokenizerProperties#F_COUNT_LINES} is set, this method will
695    * return the line number starting with 0 in the input stream. See the method
696    * description in {@link Tokenizer}.
697    *
698    * @return the current line number starting with 0 or -1 if no line numbers are supplied.
699    * @see #getColumnNumber
700    */

701   public int getLineNumber() {
702     return _lineNumber;
703   }
704   
705   /**
706    * If the flag {@link TokenizerProperties#F_COUNT_LINES} is set, this method will
707    * return the current column positionstarting with 0 in the input stream. See
708    * the method description in {@link Tokenizer}.
709    *
710    * @return the current column position
711    * @see #getLineNumber
712    */

713   public int getColumnNumber() {
714     return _columnNumber;
715   }
716   
717   /**
718    * Getting the current read offset. See the method description in
719    * {@link Tokenizer}.
720    *
721    * @return the absolute offset in characters from the start of the data source
722    * of the Tokenizer where reading will be continued
723    * @see #setReadPositionAbsolute
724    * @see #setReadPositionRelative
725    */

726   public int getReadPosition() {
727     return _currentReadPos;
728   }
729   
730   /**
731    * Retrieving the number of the currently available characters. See the method
732    * description in {@link Tokenizer}.
733    *
734    * @return number of currently available characters
735    */

736   public int currentlyAvailable() {
737     return _currentWritePos - getRangeStart();
738   }
739   
740   /**
741    * Try to read more data into the text buffer of the tokenizer. See the method
742    * description in {@link Tokenizer}.
743    *
744    * @return the number of character now available
745    * @throws TokenizerException generic exception (list) for all problems that
746    * may occur while reading (IOExceptions for instance)
747    */

748   public int readMore() throws TokenizerException {
749     readMoreDataFromBase();
750     return currentlyAvailable();
751   }
752
753   /**
754    * Returns the character at the given position. The method does not attempt to
755    * read more data.
756    *
757    * @param pos get character on this position in the data stream
758    * @return the character at the given position
759    * @throws IndexOutOfBoundsException if the parameter <code>pos</code> is not
760    * in the available text range (text window)
761    */

762   public char getChar(int pos) throws IndexOutOfBoundsException JavaDoc {
763     return getBaseDataProvider(pos, 1).getCharAt(0);
764   }
765   
766   /**
767    * Retrieve text from the currently available range. See the method description
768    * in {@link Tokenizer}.
769    *
770    * @param start position where the text begins
771    * @param len length of the text
772    * @return the text beginning at the given position ith the given length
773    * @throws IndexOutOfBoundsException if the starting position or the length
774    * is out of the current text window
775    */

776   public String JavaDoc getText(int start, int len) throws IndexOutOfBoundsException JavaDoc {
777     return getBaseDataProvider(start, len).toString();
778   }
779   
780   /**
781    * This method sets the tokenizers current read position to the given absolute
782    * read position. See the method description in {@link Tokenizer}.
783    *<br>
784    * When using this method with embedded tokenizers, the user is responsible to
785    * set the read position in the currently used tokenizer. It will be propagated
786    * by the next call to {@link #switchTo}. Until that point, a call to this
787    * method has no effect on the other tokenizers sharing the same data source.
788    *
789    * @param position absolute position for the next parse operation
790    * @throws IndexOutOfBoundsException if the parameter <code>position</code> is
791    * not in the available text range (text window)
792    * @see #setReadPositionRelative
793    */

794   public void setReadPositionAbsolute(int position) throws IndexOutOfBoundsException JavaDoc {
795     if (position < getRangeStart()) {
796       throw new ExtIndexOutOfBoundsException(
797                   "Invalid read position {0} below the current text window start {1}.",
798                   new Object JavaDoc[] { new Integer JavaDoc(position), new Integer JavaDoc(getRangeStart()) }
799                 );
800     } else if (position > _currentWritePos) {
801       throw new ExtIndexOutOfBoundsException(
802                   "Invalid read position {0} at or above the current text window end {1}.",
803                   new Object JavaDoc[] { new Integer JavaDoc(position), new Integer JavaDoc(currentlyAvailable() + getRangeStart()) }
804                 );
805     }
806     _currentReadPos = position;
807     Arrays.fill(_scannedToken, null);
808     
809     // adjust line and column counting
810
if (isFlagSet(Flags.F_COUNT_LINES)) {
811       SortedMap JavaDoc map = _position2LineMap.headMap(new Integer JavaDoc(position + 1));
812       
813       if (map != null && ! map.isEmpty()) {
814         Integer JavaDoc lastLineStart = (Integer JavaDoc)map.lastKey();
815         
816         _lineNumber = ((Integer JavaDoc)map.get(lastLineStart)).intValue();
817         _columnNumber = position - lastLineStart.intValue();
818       } else {
819         _lineNumber = 0;
820         _columnNumber = position;
821       }
822     }
823   }
824
825   /**
826    * This method sets the tokenizers new read position the given number of characters
827    * forward (positive value) or backward (negative value) starting from the current
828    * read position. See the method description in {@link Tokenizer}.
829    *<br>
830    * When using this method with embedded tokenizers, the user is responsible to
831    * set the read position in the currently used tokenizer. It will be propagated
832    * by the next call to {@link #switchTo}. Until that point, a call to this
833    * method has no effect on the other tokenizers sharing the same data source.
834    *
835    * @param offset number of characters to move forward (positive offset) or
836    * backward (negative offset)
837    * @throws IndexOutOfBoundsException if the parameter <code>offset</code> would
838    * move the read position out of the available text range (text window)
839    * @see #setReadPositionAbsolute
840    */

841   public void setReadPositionRelative(int offset) throws IndexOutOfBoundsException JavaDoc {
842     setReadPositionAbsolute(getReadPosition() + offset);
843   }
844
845   /**
846    * Closing this tokenizer frees resources and deregisters from the
847    * associated {@link TokenizerProperties} object.
848    */

849   public void close() {
850     // deregister from the properties
851
if (_properties != null) {
852       _properties.removeTokenizerPropertyListener(this);
853       _properties = null;
854     }
855
856     // freeing memory
857
if (_position2LineMap != null) {
858       _position2LineMap.clear();
859       _position2LineMap = null;
860     }
861     
862     // adjust members
863
_eofReached = true;
864     _flags = 0;
865     _flagMask = 0;
866     _internalFlags = 0;
867     _currentReadPos = 0;
868     _currentWritePos = 0;
869     _lineNumber = -1;
870     _columnNumber = -1;
871     _nextTokenizer = null;
872     _prevTokenizer = null;
873     _whitespaceHandler = null;
874     _separatorHandler = null;
875     _keywordHandler = null;
876     _sequenceHandler = null;
877     _patternHandler = null;
878     _source = null;
879     Arrays.fill(_scannedToken, null);
880   }
881
882   
883   //---------------------------------------------------------------------------
884
// embedded tokenizer support
885
//
886

887   /**
888    * Adding an embedded tokenizer. Embedded tokenizer work on the same input
889    * buffer as their base tokenizer. A situation where embedded tokenizer could
890    * be applied, is a HTML stream with cascading style sheet (CSS) and JavaScript
891    * parts.
892    *<br>
893    * There are no internal means of switching from one tokenizer to another.
894    * This should be done by the caller using the method {@link #switchTo}.
895    *<br>
896    * The {@link TokenizerProperties#F_KEEP_DATA} and {@link TokenizerProperties#F_COUNT_LINES}
897    * flags of the base tokenizer take effect also in the embedded tokenizers.
898    *<br>
899    * Since is might be possible that the given <code>tokenizer</code> is a
900    * derivation of the <code>AbstractTokenizer</code> class, this method is
901    * synchronized on <code>tokenizer</code>.
902    *
903    * @param tokenizer an embedded tokenizer
904    * @throws TokenizerException if something goes wrong (not likely :-)
905    */

906   public void addTokenizer(AbstractTokenizer tokenizer) throws TokenizerException {
907     AbstractTokenizer curr = this;
908     
909     while (curr._nextTokenizer != null) {
910       curr = curr._nextTokenizer;
911     }
912     
913     if (tokenizer != null) {
914       synchronized(tokenizer) {
915         curr._nextTokenizer = tokenizer;
916         tokenizer._prevTokenizer = curr;
917
918         // share the input buffer of the base tokenizer
919
AbstractTokenizer baseTokenizer = getBaseTokenizer();
920         
921         tokenizer._baseTokenizer = baseTokenizer;
922         
923         // inherited flags
924
tokenizer.changeParseFlags(baseTokenizer.getParseFlags(), Flags.F_COUNT_LINES);
925       }
926     }
927   }
928
929   /**
 * Switching from one tokenizer to another. If the given tokenizer has not been
931    * added with {@link #addTokenizer}, an exception is thrown.<br>
 * The <code>switchTo</code> method does the necessary synchronisation between
 * <code>this</code> and the given tokenizer. The user is therefore responsible
 * to use <code>switchTo</code> whenever a tokenizer change is necessary. It
935    * must be done this way:
936    *<blockquote><pre>
937    * Tokenizer base = new MyTokenizer(...)
938    * Tokenizer embedded = new MyTokenizer(...)
939    *
940    * // setting properties (comments, keywords etc.)
941    * ...
942    *
943    * // embedding a tokenizer
944    * base.addTokenizer(embedded);
945    *
946    * // tokenizing with base
947    * ...
948    * if (<i>switch_condition</i>) {
949    * base.switchTo(embedded);
950    * }
951    *
952    * // tokenizing with embedded
953    * ...
954    * if (<i>switch_condition</i>) {
955    * embedded.switchTo(base);
956    * }
957    *</pre></blockquote>
958    * That way we avoid a more complex synchronisation between tokenizers whenever
959    * one of them parses the next data in the input stream. However, the danger
960    * of not synchronized tokenizers remains, so take care.
961    *<br>
962    * Since is might be possible that the given <code>tokenizer</code> is a
963    * derivation of the <code>AbstractTokenizer</code> class, this method is
964    * synchronized on <code>tokenizer</code>.
965    *
966    * @param tokenizer the tokenizer that should be used from now on
967    */

968   public void switchTo(AbstractTokenizer tokenizer) throws TokenizerException {
969     if (tokenizer != null) {
970       synchronized(tokenizer) {
971         if (tokenizer._baseTokenizer != _baseTokenizer) {
972           throw new TokenizerException("Trying to switch to an alien tokenizer (not added with addTokenizer).", null);
973         }
974         tokenizer._eofReached = this._eofReached;
975         tokenizer._currentReadPos = this._currentReadPos;
976         tokenizer._currentWritePos = this._currentWritePos;
977         tokenizer._columnNumber = this._columnNumber;
978         tokenizer._lineNumber = this._lineNumber;
979         tokenizer._position2LineMap = this._position2LineMap;
980       }
981     } else {
982       throw new TokenizerException(new NullPointerException JavaDoc());
983     }
984   }
985
986
987   //---------------------------------------------------------------------------
988
// Methods that may be overwritten in derived classes
989
//
990

991   /**
992    * This method checks if the character is a whitespace. Implement Your own
993    * code for situations where this default implementation is not fast enough
994    * or otherwise not really good.
995    *
996    * @param testChar check this character
997    * @return <code>true</code> if the given character is a whitespace,
998    * <code>false</code> otherwise
999    */

1000  protected boolean isWhitespace(char testChar) {
1001    if (_whitespaceHandler != null) {
1002      return _whitespaceHandler.isWhitespace(testChar);
1003    } else {
1004      return false;
1005    }
1006  }
1007      
1008  /**
1009   * This method detects the number of whitespace characters starting at the given
1010   * position. It should return the number of characters identified as whitespaces
1011   * starting from and including the given start position.
1012   *<br>
1013   * Then overriding this method, use {@link #getBaseDataProvider} to access characters.
1014   *<br>
1015   * Do not attempt to actually read more data or do anything that leads to the
1016   * change of the data source or to tokenizer switching. This is done by the
1017   * tokenizer framework.
1018   *
1019   * @param startingAtPos start checking for whitespace from this position
1020   * @param maxChars if there is no non-whitespace character, read up to this number of characters
1021   * @return number of whitespace characters starting from the given offset
1022   * @throws TokenizerException failure while reading data from the input stream
1023   */

1024  protected int readWhitespaces(int startingAtPos, int maxChars) throws TokenizerException {
1025    if (_whitespaceHandler != null) {
1026      DataProvider dataProvider = getBaseDataProvider(startingAtPos, maxChars);
1027      return _whitespaceHandler.countLeadingWhitespaces(dataProvider);
1028    } else {
1029      return 0;
1030    }
1031  }
1032  
1033  /**
1034   * This method checks if the character sequence starting at a given position
 * with a given length is a keyword. If so, it returns the keyword description
1036   * as {@link TokenizerProperty} object.
1037   *
1038   * @param startingAtPos check at this position
1039   * @param length the candidate has this number of characters
1040   * @throws TokenizerException routed exception from the active {@link de.susebox.jtopas.spi.KeywordHandler}
1041   * @return {@link TokenizerProperty} describing the keyword or <code>null</code>
1042   */

1043  protected TokenizerProperty isKeyword(int startingAtPos, int length) throws TokenizerException {
1044    if (_keywordHandler != null) {
1045      DataProvider dataProvider = getBaseDataProvider(startingAtPos, length);
1046      return _keywordHandler.isKeyword(dataProvider);
1047    } else {
1048      return null;
1049    }
1050  }
1051  
1052  
1053  //---------------------------------------------------------------------------
1054
// TokenizerPropertyListener methods
1055
//
1056

1057  /**
1058   * Splits a given String into lines. The method ist used to retrieve the
1059   * image parts of several token types.
1060   *
1061   * @param image split this string into lines
1062   * @return an array containing the lines of the image without line separator
1063   * characters
1064   */

1065  protected String JavaDoc[] splitIntoLines(String JavaDoc image) {
1066    LinkedList JavaDoc lines = new LinkedList JavaDoc();
1067    int index = 0;
1068    int start = 0;
1069    
1070    while (index < image.length()) {
1071      switch (image.charAt(index)) {
1072      case '\r':
1073        lines.add(image.substring(start, index));
1074        if (index + 1 < image.length() && image.charAt(index + 1) == '\n') {
1075          index += 2;
1076        } else {
1077          index++;
1078        }
1079        start = index;
1080        break;
1081      case '\n':
1082        lines.add(image.substring(start, index));
1083        start = ++index;
1084        break;
1085      default:
1086        index++;
1087      }
1088    }
1089    
1090    if (start < index || start > 0) {
1091      lines.add(image.substring(start, index));
1092    }
1093  
1094    return (String JavaDoc[])lines.toArray(new String JavaDoc[lines.size()]);
1095  }
1096  
1097  /**
1098   * Splits a given string into lines and removing string escapes. The method is
1099   * used to retrieve the image parts for string token types.
1100   *
1101   * @param prop the {@link TokenizerProperty} describing a string
1102   * @param image split this string into lines
1103   * @return an array containing the lines of the image without line separator
1104   * characters
1105   */

1106  protected String JavaDoc[] splitString(TokenizerProperty prop, String JavaDoc image) {
1107    // complete string
1108
String JavaDoc[] images = prop.getImages();
1109    String JavaDoc begin = images[0];
1110    String JavaDoc end = images[1];
1111    String JavaDoc esc = images[2];
1112    boolean noCase = isFlagSet(prop, Flags.F_NO_CASE);
1113    boolean escEqualsEnd = ( ! noCase && esc.compareTo(end) == 0)
1114                                  || ( noCase && esc.compareToIgnoreCase(end) == 0);
1115
1116    StringBuffer JavaDoc buffer = null;
1117    int index = begin.length();
1118    int start = index;
1119    int endIndex;
1120    
1121    if ( image.length() - start >= end.length()
1122        && ( ( ! noCase && end.equals(image.substring(image.length() - end.length())))
1123            || ( noCase && end.equalsIgnoreCase(image.substring(image.length() - end.length()))))) {
1124      endIndex = image.length() - end.length();
1125    } else {
1126      endIndex = image.length();
1127    }
1128    
1129    while (index < endIndex) {
1130      if ( ( ! noCase && image.startsWith(esc, index))
1131          || ( noCase && image.substring(index, index + esc.length()).equalsIgnoreCase(esc))) {
1132        if (buffer == null) {
1133          buffer = new StringBuffer JavaDoc(image.length());
1134        }
1135        buffer.append(image.substring(start, index));
1136        index += esc.length();
1137        if (index < image.length()) {
1138          if ( ( ! noCase && image.startsWith(esc, index))
1139              || ( noCase && image.substring(index, index + esc.length()).equalsIgnoreCase(esc))) {
1140            buffer.append(esc);
1141            index += esc.length();
1142          } else if ( ( ! noCase && image.startsWith(begin, index))
1143                     || ( noCase && image.substring(index, index + begin.length()).equalsIgnoreCase(begin))) {
1144            buffer.append(begin);
1145            index += begin.length();
1146          } else if ( ( ! noCase && image.startsWith(end, index))
1147                     || ( noCase && image.substring(index, index + end.length()).equalsIgnoreCase(end))) {
1148            buffer.append(end);
1149            index += end.length();
1150          }
1151        }
1152        start = index;
1153      }
1154      index++;
1155    }
1156    
1157    if (buffer != null && start < index) {
1158      buffer.append(image.substring(start, endIndex));
1159    }
1160  
1161    return splitIntoLines((buffer != null) ? buffer.toString() : image.substring(start, endIndex));
1162  }
1163  
1164  /**
1165   * Splits a given block comment into lines. The method is used to retrieve the
1166   * image parts for block comment token types.
1167   *
1168   * @param prop the {@link TokenizerProperty} describing a block comment
1169   * @param image split this string into lines
1170   * @return an array containing the lines of the image without line separator
1171   * characters
1172   */

1173  protected String JavaDoc[] splitBlockComment(TokenizerProperty prop, String JavaDoc image) {
1174    // complete string
1175
String JavaDoc[] images = prop.getImages();
1176    String JavaDoc start = images[0];
1177    String JavaDoc end = images[1];
1178    boolean noCase = isFlagSet(prop, Flags.F_NO_CASE);
1179
1180    if ( image.length() - start.length() >= end.length()
1181        && ( ( ! noCase && end.equals(image.substring(image.length() - end.length())))
1182            || ( noCase && end.equalsIgnoreCase(image.substring(image.length() - end.length()))))) {
1183      return splitIntoLines(image.substring(start.length(), image.length() - end.length()));
1184    } else {
1185      return splitIntoLines(image.substring(start.length()));
1186    }
1187  }
1188  
1189  /**
1190   * Event handler method. The given {@link TokenizerPropertyEvent} parameter
1191   * contains the nessecary information about the property change. We choose
1192   * one single method in favour of various more specialized methods since the
1193   * reactions on adding, removing and modifying tokenizer properties are often
1194   * the same (flushing cash, rereading information etc.) are probably not very
1195   * different.
1196   *<br>
1197   * Note that a modification of the parse flags in the backing {@link TokenizerProperties}
1198   * object removes all flags previously modified through {@link #changeParseFlags}.
1199   *
1200   * @param event the {@link TokenizerPropertyEvent} that describes the change
1201   */

1202  public void propertyChanged(TokenizerPropertyEvent event) {
1203    TokenizerProperty prop = event.getProperty();
1204    String JavaDoc[] images = prop.getImages();
1205    
1206    synchronized(this) {
1207      switch (event.getType()) {
1208      case TokenizerPropertyEvent.PROPERTY_ADDED:
1209      case TokenizerPropertyEvent.PROPERTY_REMOVED:
1210        switch (prop.getType()) {
1211        case Token.LINE_COMMENT:
1212        case Token.BLOCK_COMMENT:
1213        case Token.STRING:
1214        case Token.SPECIAL_SEQUENCE:
1215          if ( (_internalFlags & IFLAG_EXTERNAL_SEQUENCE_HANDLER) == 0
1216              && _properties instanceof de.susebox.jtopas.spi.SequenceHandler) {
1217            setSequenceHandler((de.susebox.jtopas.spi.SequenceHandler)_properties);
1218          }
1219          break;
1220        case Token.KEYWORD:
1221          if ( (_internalFlags & IFLAG_EXTERNAL_KEYWORD_HANDLER) == 0
1222              && _properties instanceof de.susebox.jtopas.spi.KeywordHandler) {
1223            setKeywordHandler((de.susebox.jtopas.spi.KeywordHandler)_properties);
1224          }
1225          break;
1226        case Token.PATTERN:
1227          if ( (_internalFlags & IFLAG_EXTERNAL_PATTERN_HANDLER) == 0
1228              && _properties instanceof de.susebox.jtopas.spi.PatternHandler) {
1229            setPatternHandler((de.susebox.jtopas.spi.PatternHandler)_properties);
1230          }
1231          break;
1232        }
1233        break;
1234        
1235      case TokenizerPropertyEvent.PROPERTY_MODIFIED:
1236        switch (prop.getType()) {
1237        case TokenizerProperty.PARSE_FLAG_MASK:
1238          _flags = getTokenizerProperties().getParseFlags();
1239          _flagMask = 0;
1240          if (isFlagSet(Flags.F_COUNT_LINES)) {
1241            if (_lineNumber < 0) {
1242              if (_position2LineMap != null) {
1243                _position2LineMap.clear();
1244              }
1245              _lineNumber = 0;
1246              putPosition(_currentReadPos, _lineNumber);
1247            }
1248            if (_columnNumber < 0) {
1249              _columnNumber = 0;
1250            }
1251          } else {
1252            _lineNumber = -1;
1253            _columnNumber = -1;
1254          }
1255          break;
1256        }
1257        break;
1258      }
1259    }
1260  }
1261  
1262
1263  //---------------------------------------------------------------------------
1264
// Implementation
1265
//
1266

1267  /**
1268   * Embedded tokenizers have their base tokenizer they share the input stream
1269   * with.
1270   *
1271   * @return the base tokenizer (the one owning the input stream and text buffer)
1272   */

  protected AbstractTokenizer getBaseTokenizer() {
    // _baseTokenizer refers to this tokenizer itself unless this instance
    // was registered as an embedded tokenizer via addTokenizer
    return _baseTokenizer;
  }
1276  
1277  /**
1278   * Returns the {@link de.susebox.jtopas.spi.DataProvider} of the base tokenizer.
1279   * This is this tokenizer if it is not an embedded one.
1280   *
1281   * @param startPos position in the input data
1282   * @param length number of characters
1283   * @return the <code>DataProvider</code> for the given data range
1284   */

1285  protected DataProvider getBaseDataProvider(int startPos, int length) {
1286    return getBaseTokenizer().getDataProvider(startPos, length);
1287  }
1288
1289  /**
1290   * This method organizes the input buffer. It moves the current text window if
1291   * nessecary or allocates more space, if data should be kept completely (see the
1292   * {@link TokenizerProperties#F_KEEP_DATA} flag).
1293   * Its main purpose is to call the {@link TokenizerSource#read} method.
1294   *
1295   * @return number of read bytes or -1 if an end-of-file condition occured
1296   * @throws TokenizerException wrapped exceptions from the {@link TokenizerSource#read}
1297   * method
1298   */

1299  protected int readMoreDataFromBase() throws TokenizerException {
1300    // its always the base tokenizer doing the reading
1301
int readChars = -1;
1302    
1303    if ( ! _eofReached) {
1304      AbstractTokenizer baseTokenizer = getBaseTokenizer();
1305
1306      if (baseTokenizer != this) {
1307        readChars = baseTokenizer.readMoreData();
1308      } else {
1309        readChars = readMoreData();
1310      }
1311      if (readChars > 0) {
1312        _currentWritePos += readChars;
1313      } else if (readChars < 0) {
1314        readChars = -1;
1315        _eofReached = true;
1316      }
1317
1318      // Inform all embedded tokenizers about input buffer changes
1319
synchronizeAll();
1320    }
1321    return readChars;
1322  }
1323  
1324  /**
1325   * When the method {@link #readMoreData} changes the contents of the input buffer
1326   * or the input buffer itself, all embedded tokenizers must be synchronized.
1327   * That means their member variables are adjusted to the base tokenizer.
1328   *
1329   * @throws TokenizerException if something goes wrong
1330   */

1331  protected void synchronizeAll() throws TokenizerException {
1332    AbstractTokenizer embedded = getBaseTokenizer();
1333
1334    while ((embedded = embedded._nextTokenizer) != null) {
1335      switchTo(embedded); // adjust the member variables
1336
}
1337  }
1338
1339  /**
1340   * Checks the EOF condition at the given offset.
1341   *
1342   * @param offset check at this position relative to the current read position
1343   * @return <code>true</code> if EOF has been reached, <code>false</code> otherwise
1344   * @throws TokenizerException failure while reading data from the input stream
1345   */

1346  protected boolean isEOF(int offset) throws TokenizerException {
1347    if (_currentReadPos + offset < _currentWritePos || readMoreDataFromBase() > 0) {
1348      return false;
1349    } else {
1350      _scannedToken[1] = new Token(Token.EOF);
1351      return true;
1352    }
1353  }
1354    
1355  /**
1356   * The number of characters until the next comment, whitespace, string, special
1357   * sequence or separator are determined. The character sequnce is then checked
1358   * for keyword or pattern matching.
1359   *
1360   * @param token buffer to receive information about the keyword or normal token
1361   * @return <code>null</code> or a {@link TokenizerProperty} if a keyword or pattern is found
1362   * @throws TokenizerException failure while reading data from the input stream
1363   */

1364  protected TokenizerProperty completeBoundedToken(Token token) throws TokenizerException {
1365    // find out the return value (length of normal token)
1366
int len = 1; // the first character is a normal one, see call of this method
1367

1368    while ( ! ( isEOF(len)
1369               || isWhitespace(len)
1370               || isPattern(len, true)
1371               || isSpecialSequence(len)
1372               || isSeparator(len))) {
1373      len++;
1374    }
1375    token.setLength(len);
1376    
1377    // test on keyword or non-free pattern
1378
TokenizerProperty prop = null;
1379    PatternHandler.Result result;
1380    
1381    if ((prop = isKeyword(_currentReadPos, len)) != null) {
1382      token.setType(Token.KEYWORD);
1383      token.setCompanion(prop.getCompanion());
1384    } else {
1385      token.setType(Token.NORMAL);
1386    }
1387    return prop;
1388  }
1389  
1390  /**
1391   * After having identified a whitespace, this method continues to read data
1392   * until it detects a non-whitespace.
1393   *
1394   * @return number of consecutive whitespaces
1395   * @throws TokenizerException failure while reading data from the input stream
1396   */

1397  protected int completeWhitespace() throws TokenizerException {
1398    int start = _currentReadPos + 1; // the first whitespace we have already
1399
int available = _currentWritePos - start;
1400    int len = readWhitespaces(start, available);
1401    
1402    while (len == available) {
1403      if (readMoreDataFromBase() <= 0) {
1404        break;
1405      }
1406      start += len;
1407      available = _currentWritePos - start;
1408      len += readWhitespaces(start, available);
1409    }
1410    return len + 1; // the first whitespace we had already
1411
}
1412  
1413  /**
1414   * This method checks at the given offset if it is a whitespace.
1415   *
1416   * @param offset check at this position relative to the current read position
1417   * @throws TokenizerException failure while reading data from the input stream
1418   * @return <code>true</code> if a whitespace sequence was found at the given offset,
1419   * <code>false</code> otherwise
1420   */

  protected boolean isWhitespace(int offset) throws TokenizerException {
    if (_whitespaceHandler != null) {
      // ensure the character at the requested offset is available.
      // NOTE(review): this proceeds to getChar when readMoreDataFromBase
      // returns 0, while isSeparator requires a value > 0 — confirm that
      // 0 cannot be returned when the offset is still out of range
      if (_currentReadPos + offset >= _currentWritePos && readMoreDataFromBase() < 0) {
        return false;
      }
    
      // record the whitespace token for the following nextToken call
      if (isWhitespace(getChar(_currentReadPos + offset))) {
        _scannedToken[1] = new Token(Token.WHITESPACE);
        return true;
      }
    }
    return false;
  }
1434      
1435  /**
1436   * This method checks at the given offset if it contains a separator.
1437   *
1438   * @param offset check at this position relative to the current read position
1439   * @throws TokenizerException failure while reading data from the input stream
 * @return <code>true</code> if a separator was found at the given offset,
1441   * <code>false</code> otherwise
1442   */

1443  protected boolean isSeparator(int offset) throws TokenizerException {
1444    if ( _separatorHandler != null
1445        && (_currentReadPos + offset < _currentWritePos || readMoreDataFromBase() > 0)
1446        && _separatorHandler.isSeparator(getChar(_currentReadPos + offset))) {
1447      _scannedToken[1] = new Token(Token.SEPARATOR);
1448      return true;
1449    } else {
1450      return false;
1451    }
1452  }
1453  
1454  /**
1455   * Testing for pattern matching.
1456   *
1457   * @param offset check at this position relative to the current read position
1458   * @param freePatternOnly if <code>true</code> consider only pattern that can occur anywhere in the data
1459   * @throws TokenizerException failure while reading data from the input stream
1460   * @return <code>true</code> if a pattern match was found at the given offset,
1461   * <code>false</code> otherwise
1462   */

  protected boolean isPattern(int offset, boolean freePatternOnly) throws TokenizerException {
    if (_patternHandler != null) {
      // pattern matching may need a lot of lookahead; fill the buffer up
      // to PATTERN_MAX_SIZE characters behind the starting position
      int startingAtPos = _currentReadPos + offset;
      
      while (_currentWritePos - startingAtPos < PATTERN_MAX_SIZE) {
        if (readMoreDataFromBase() <= 0) {
          break;
        }
      }

      // try pattern matching
      DataProvider dataProvider = getBaseDataProvider(startingAtPos, _currentWritePos - startingAtPos);
      PatternHandler.Result result = _patternHandler.matches(dataProvider);
      boolean isFree = (result != null) ? isFlagSet(result.getProperty(), Flags.F_FREE_PATTERN) : false;
      
      if (result != null && (isFree || ! freePatternOnly)) {
        if ( ! isFree) {
          // a non-free pattern only counts when terminated by EOF,
          // whitespace, another pattern, a special sequence or a separator
          int nextOffset = offset + result.getLengthOfMatch();
          
          if ( isEOF(nextOffset)
              || isWhitespace(nextOffset)
              || isPattern(nextOffset, true)
              || isSpecialSequence(nextOffset)
              || isSeparator(nextOffset)) {
            // the checks above stored the terminating token in
            // _scannedToken[1]; keep it for a later nextToken call
            _scannedToken[2] = _scannedToken[1];
          } else {
            return false;
          }
        }
        _scannedToken[1] = new Token(Token.PATTERN, null, result.getProperty());
        _scannedToken[1].setLength(result.getLengthOfMatch());
        if (isFlagSet(result.getProperty(), Flags.F_RETURN_IMAGE_PARTS)) {
          _scannedToken[1].setImageParts(result.getGroups());
        }
        return true;
      }
    }
    
    // no pattern matching available or no match found
    return false;
  }
1505  
1506  /**
1507   * This method checks at the given offset if it contains a a special sequence.
1508   * Unlike the method {@link #test4SpecialSequence} it does nothing more.
1509   *
1510   * @param offset check at this position relative to the current read position
1511   * @throws TokenizerException failure while reading data from the input stream
1512   * @return <code>true</code> if a special sequence was found at the given offset,
1513   * <code>false</code> otherwise
1514   */

1515  protected boolean isSpecialSequence(int offset) throws TokenizerException {
1516    if (_sequenceHandler != null) {
1517      // do we need more data to ensure enough characters for even the longest
1518
// possible sequence match
1519
int startingAtPos = _currentReadPos + offset;
1520    
1521      while (_sequenceHandler.getSequenceMaxLength() > _currentWritePos - startingAtPos) {
1522        if (readMoreDataFromBase() <= 0) {
1523          break;
1524        }
1525      }
1526      
1527      // invoke the sequence handler
1528
DataProvider dataProvider = getBaseDataProvider(startingAtPos, _currentWritePos - startingAtPos);
1529      TokenizerProperty prop = _sequenceHandler.startsWithSequenceCommentOrString(dataProvider);
1530    
1531      if (prop != null) {
1532        _scannedToken[1] = new Token(prop.getType(), null, prop);
1533        return true;
1534      }
1535    }
1536    
1537    // no sequence handler given or no special sequence at given offset
1538
return false;
1539  }
1540  
1541  /**
1542   * Completing a line comment. After a line comment sequence has been found, all
1543   * characters up to and including the end-of-line combination belong to the
1544   * line comment. Note that on reaching end-of-file a line comment does not
1545   * nessecarily ends with an end-of-line sequence (linefeed for example).
1546   *
1547   * @param prop the property describing the line comment to complete
1548   * @return length of the line comment
1549   * @throws TokenizerException failure while reading data from the input stream
1550   */

1551  protected int completeLineComment(TokenizerProperty prop) throws TokenizerException {
1552    String JavaDoc[] images = prop.getImages();
1553    int len = images[0].length();
1554
1555    while (_currentReadPos + len < _currentWritePos || readMoreDataFromBase() > 0) {
1556      switch (getChar(_currentReadPos + len)) {
1557      case '\r':
1558        len++;
1559        if (_currentReadPos + len < _currentWritePos || readMoreDataFromBase() > 0) {
1560          if (getChar(_currentReadPos + len) == '\n') {
1561            len++;
1562          }
1563        }
1564        return len;
1565      case '\n':
1566        len++;
1567        return len;
1568      default:
1569        len++;
1570      }
1571    }
1572    return len;
1573  }
1574  
1575  /**
1576   * Completing a block comment. After a block comment sequence has been found, all
1577   * characters up to and including the end sequence of the block comment belong
1578   * to the block comment. Note that on reaching end-of-file a block comment does
1579   * not nessecarily ends with an end-of-block-comment sequence.
1580   *
1581   * @param prop the property describing the block comment to complete
1582   * @return length of the block comment
1583   * @throws TokenizerException failure while reading data from the input stream
1584   */

  protected int completeBlockComment(TokenizerProperty prop) throws TokenizerException {
    // block comment delimiters and behavior flags
    String[] images = prop.getImages();
    String start = images[0];
    String end = images[1];
    boolean noCase = isFlagSet(prop, Flags.F_NO_CASE);
    boolean nested = isFlagSet(prop, Flags.F_ALLOW_NESTED_COMMENTS);
    int len = start.length();   // scan starts behind the opening sequence
    int level = 0;              // nesting level; drops below 0 at the final end sequence

    __LOOP__:
    do {
      // test on nested comments: we take only care for nesting the same
      // block comment
      if (nested) {
        switch (comparePrefix(len, start, noCase)) {
        case 0: // comment start identified
          level++;
          len += start.length();
          continue __LOOP__;
        case -1: // EOF reached: the comment stays open, return remaining data
          return _currentWritePos - _currentReadPos;
        }
      }
      
      // is it the end ?
      switch (comparePrefix(len, end, noCase)) {
      case 0: // comment end identified
        level--;
        len += end.length();
        break;
      case -1: // EOF reached
        return _currentWritePos - _currentReadPos;
      default: // an ordinary comment character
        len++;
      }
    } while (level >= 0);

    // block comment regularly terminated
    return len;
  }
1625  
1626  /**
1627   * Completing a string. After a string start sequence has been found, all
1628   * characters up to and including the end-of-string sequence belong to the
1629   * string. Note that on reaching end-of-file a string does not nessecarily ends
1630   * with an end-of-string sequence.
1631   *
1632   * @param prop the property describing the string to complete
1633   * @return length of the string
1634   * @throws TokenizerException failure while reading data from the input stream
1635   */

1636  protected int completeString(TokenizerProperty prop) throws TokenizerException {
1637    // complete string
1638
String JavaDoc[] images = prop.getImages();
1639    String JavaDoc start = images[0];
1640    String JavaDoc end = images[1];
1641    String JavaDoc esc = images[2];
1642    int len = start.length();
1643    boolean noCase = isFlagSet(prop, Flags.F_NO_CASE);
1644    boolean escEqualsEnd = ( ! noCase && esc.compareTo(end) == 0)
1645                              || ( noCase && esc.compareToIgnoreCase(end) == 0);
1646
1647    while (true) {
1648      // test on escape
1649
if (esc != null) {
1650        switch (comparePrefix(len, esc, noCase)) {
1651        case 0: // escape found
1652
len += esc.length();
1653          if (escEqualsEnd) {
1654            switch (comparePrefix(len, end, noCase)) {
1655            case 0:
1656              len += end.length();
1657              break;
1658            case -1: // EOF reached
1659
return _currentWritePos - _currentReadPos;
1660            default: // this is the regular return point if the esc is the string end
1661
return len;
1662            }
1663          } else {
1664            len++; // esc != string end: skip the next character
1665
}
1666          continue;
1667        case -1: // EOF reached
1668
return _currentWritePos - _currentReadPos;
1669        }
1670      }
1671
1672      // test on end sequence
1673
switch (comparePrefix(len, end, noCase)) {
1674      case 0: // this is the regular return point if esc != string end
1675
len += end.length();
1676        return len;
1677      case -1: // EOF reached
1678
return _currentWritePos - _currentReadPos;
1679      default:
1680        len++;
1681      }
1682    }
1683  }
1684
1685  /**
1686   * This method compares the characters at the given offset (from the current
1687   * read position) with the given prefix.
1688   *
1689   * @param offset start comparing at this offset from the current read position
 * @param prefix compare read data with this prefix
1691   * @param noCase case- or not case-sensitive comparison
1692   * @throws TokenizerException failure while reading data from the input stream
1693   * @return 0 if the the given prefix matches the input stream, -1 on EOF and
1694   * 1 if not matching
1695   */

1696  protected int comparePrefix(int offset, String JavaDoc prefix, boolean noCase)
1697    throws TokenizerException
1698  {
1699    // compare
1700
int len = prefix.length();
1701    
1702    for (int pos = offset; pos < offset + len; ++pos) {
1703      // do we have enough data
1704
if (_currentReadPos + pos >= _currentWritePos && readMoreDataFromBase() < 0) {
1705        return -1;
1706      }
1707      
1708      // compare single character
1709
char c1 = prefix.charAt(pos - offset);
1710      char c2 = getChar(_currentReadPos + pos);
1711      
1712      if ( c1 != c2
1713          && (! noCase || Character.toUpperCase(c1) != Character.toUpperCase(c2))) {
1714        return 1;
1715      }
1716    }
1717    
1718    // found
1719
return 0;
1720  }
1721  
1722  /**
1723   * The method recomputes the line and column position of the tokenizer, if the
1724   * flag {@link TokenizerProperties#F_COUNT_LINES} is set. It gets the token type of the
1725   * {@link Token} that has been retrieved by the calling {@link #nextToken}.
1726   * Using the tokenizer control flags and certain other information it tries to
1727   * to find end-of-line sequences as fast as possible. For example, a line
1728   * comment should always contain a end-of-line sequence, so we can simply
1729   * increase the line count and set the column count to 0.
1730   *
1731   * @param type the type of the current token
1732   * @param length the length of the current token
1733   */

1734  protected void adjustLineAndColumn(int type, int length) {
1735    // line and column counting not required
1736
if ( ! isFlagSet(Flags.F_COUNT_LINES)) {
1737      return;
1738    }
1739    
1740    // there might be a simple way to determine the current line and column position
1741
switch (type) {
1742    case Token.EOF:
1743      return;
1744        
1745    case Token.LINE_COMMENT: // a line comment always ends with a newline
1746
_lineNumber++;
1747      _columnNumber = 0;
1748      putPosition(_currentReadPos + length, _lineNumber);
1749      return;
1750      
1751    case Token.SPECIAL_SEQUENCE:
1752    case Token.SEPARATOR:
1753    case Token.NORMAL:
1754    case Token.KEYWORD:
1755      if (_whitespaceHandler != null && _whitespaceHandler.newlineIsWhitespace()) { // newline is a whitespace character
1756
_columnNumber += length; // it should therefore not occure in other
1757
return; // tokens
1758
}
1759      break;
1760        
1761    case Token.WHITESPACE:
1762      if ( ! (_whitespaceHandler.isWhitespace('\n') || _whitespaceHandler.isWhitespace('\r'))) {
1763        _columnNumber += length; // newline is not a whitespace; we do not have
1764
return; // to test for it in the current token
1765
}
1766      break;
1767    }
1768    
1769    // count it
1770
int newLineNumber = _lineNumber;
1771    
1772    for (int pos = _currentReadPos; pos < _currentReadPos + length; ++pos) {
1773      switch (getChar(pos)) {
1774      case '\r':
1775        if (pos + 1 >= _currentReadPos + length || getChar(pos + 1) != '\n') {
1776          _lineNumber++;
1777          _columnNumber = 0;
1778          putPosition(pos + 1, _lineNumber);
1779          break;
1780        }
1781        pos++;
1782        /* no break; */
1783      case '\n':
1784        _lineNumber++;
1785        _columnNumber = 0;
1786        putPosition(pos + 1, _lineNumber);
1787        break;
1788        
1789      default:
1790        _columnNumber++;
1791      }
1792    }
1793  }
1794  
1795  /**
1796   * Putting a new position into the position-to-line-number map.
1797   *
1798   * @param position the position to map to the current line number
1799   */

1800  private void putPosition(int position, int lineNumber) {
1801    if (_position2LineMap == null) {
1802      _position2LineMap = new TreeMap JavaDoc();
1803    }
1804    _position2LineMap.put(new Integer JavaDoc(position), new Integer JavaDoc(lineNumber));
1805  }
1806  
1807  /**
1808   * Checking a given flag. The method considers both the globally set flags
1809   * in the associated {@link TokenizerProperties} instance and the locally set
1810   * by {@link #changeParseFlags}.
1811   *
1812   * @param flag one of the <code>F_...</code> flags defined in {@link TokenizerProperties}
1813   */

1814  protected boolean isFlagSet(int flag) {
1815    return (getParseFlags() & flag) != 0;
1816  }
1817  
1818  /**
1819   * Checking if a given flag is set for the given {@link TokenizerProperty}, for
1820   * this <code>Tokenizer</code> or for the used {@link TokenizerProperties}. The method considers both the globally set flags
1821   * in the associated {@link TokenizerProperties} instance and the locally set
1822   * by {@link #changeParseFlags}.
1823   *
1824   * @param prop check the flag for this property
1825   * @param flag one of the {@link Flags} constants
1826   */

1827  protected boolean isFlagSet(TokenizerProperty prop, int flag) {
1828    return prop.isFlagSet(flag, (getTokenizerProperties().getParseFlags() & flag) != 0 || isFlagSet(flag));
1829  }
1830  
1831  
  //---------------------------------------------------------------------------
  // Class members
  //

  /**
   * mask of flags that can be set separately for a <code>AbstractTokenizer</code>.
   */
  protected static final int VALID_FLAGS_MASK =
      Flags.F_RETURN_WHITESPACES
    | Flags.F_TOKEN_POS_ONLY
    | Flags.F_KEEP_DATA
    | Flags.F_COUNT_LINES;

  /**
   * {@link TokenizerProperties} that are used if no others have been
   * specified by calling {@link #setTokenizerProperties}.
   */
  protected StandardTokenizerProperties _defaultProperties = null;

  /**
   * Buffer sizes
   */
  private static final int PATTERN_MAX_SIZE = 0x40000; // 256K

  /**
   * Bits for the internal flag bitmask (see {@link #_internalFlags})
   */
  private static final byte IFLAG_EXTERNAL_PATTERN_HANDLER = 0x01;
  private static final byte IFLAG_EXTERNAL_KEYWORD_HANDLER = 0x02;
  private static final byte IFLAG_EXTERNAL_SEQUENCE_HANDLER = 0x04;

  //---------------------------------------------------------------------------
  // Members
  //

  /**
   * overall tokenizer flags.
   */
  protected int _flags = 0;

  /**
   * a combination of <code>F_...</code> constants defined in {@link TokenizerProperties}
   * indicating which bits in the {@link #_flags} member are valid. All other
   * flags are taken from the associated {@link TokenizerProperties} object.
   *
   * @see #changeParseFlags
   */
  private int _flagMask = 0;

  /**
   * Flag if EOF has been reached. The flag should speed up calls to {@link #readMoreDataFromBase}
   */
  private boolean _eofReached = true;

  /**
   * Data index where {@link #nextToken} will start parsing.
   */
  protected int _currentReadPos = 0;

  /**
   * Data index where {@link #readMoreDataFromBase} will fill in new data.
   */
  protected int _currentWritePos = 0;

  /**
   * if line counting is enabled, this contains the current line number starting
   * with 0.
   */
  protected int _lineNumber = -1;

  /**
   * if line counting is enabled, this contains the current column number starting
   * with 0.
   */
  protected int _columnNumber = -1;

  /**
   * List of currently known token. The first element is the current token returned
   * by the last call to {@link #nextToken}. The following elements are look-ahead
   * token that have already been identified when extracting the current token.
   */
  protected Token[] _scannedToken = new Token[] { null, null, null };

  /**
   * For embedded tokenizers: this is the list of the succeeding tokenizers
   */
  protected AbstractTokenizer _nextTokenizer = null;

  /**
   * For embedded tokenizers: this is the base tokenizer that reads the data
   */
  protected AbstractTokenizer _baseTokenizer = null;

  /**
   * For embedded tokenizers: this is the list of the previous tokenizers
   */
  protected AbstractTokenizer _prevTokenizer = null;

  /**
   * Whitespace handler
   */
  private de.susebox.jtopas.spi.WhitespaceHandler _whitespaceHandler = null;

  /**
   * Separator handler
   */
  private de.susebox.jtopas.spi.SeparatorHandler _separatorHandler = null;

  /**
   * Keyword handler
   */
  private de.susebox.jtopas.spi.KeywordHandler _keywordHandler = null;

  /**
   * Sequence handler
   */
  private de.susebox.jtopas.spi.SequenceHandler _sequenceHandler = null;

  /**
   * Pattern handler
   */
  private de.susebox.jtopas.spi.PatternHandler _patternHandler = null;

  /**
   * The source of input data
   */
  private TokenizerSource _source = null;

  /**
   * The characteristics of this tokenizer.
   */
  private TokenizerProperties _properties = null;

  /**
   * Position-to-line-number mapping, filled by {@link #putPosition} when line
   * counting is enabled (keys are text positions, values are line numbers).
   */
  private TreeMap _position2LineMap = null;

  /**
   * Control flags for the internal work
   */
  private long _internalFlags = 0;
1975}
1976
Popular Tags