Tokenizer


1   /*
2    * Tokenizer.java: lexical parser interface.
3    *
4    * Copyright (C) 2001 Heiko Blau
5    *
6    * This file belongs to the JTopas Library.
7    * JTopas is free software; you can redistribute it and/or modify it 
8    * under the terms of the GNU Lesser General Public License as published by the 
9    * Free Software Foundation; either version 2.1 of the License, or (at your 
10   * option) any later version.
11   *
12   * This software is distributed in the hope that it will be useful, but WITHOUT
13   * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 
14   * FITNESS FOR A PARTICULAR PURPOSE. 
15   * See the GNU Lesser General Public License for more details.
16   *
17   * You should have received a copy of the GNU Lesser General Public License along
18   * with JTopas. If not, write to the
19   *
20   *   Free Software Foundation, Inc.
21   *   59 Temple Place, Suite 330, 
22   *   Boston, MA 02111-1307 
23   *   USA
24   *
25   * or check the Internet: http://www.fsf.org
26   *
27   * Contact:
28   *   email: heiko@susebox.de 
29   */
30  
31  package de.susebox.jtopas;
32  
33  //-----------------------------------------------------------------------------
34  // Imports
35  //
36  import de.susebox.jtopas.spi.WhitespaceHandler;
37  import de.susebox.jtopas.spi.SeparatorHandler;
38  import de.susebox.jtopas.spi.KeywordHandler;
39  import de.susebox.jtopas.spi.SequenceHandler;
40  import de.susebox.jtopas.spi.PatternHandler;
41  
42  
43  //-----------------------------------------------------------------------------
44  // Interface Tokenizer
45  //
46  
47  /**<p>
48   * The interface <code>Tokenizer</code> contains setup methods, parse operations 
49   * and other getter and setter methods for a tokenizer. A tokenizer splits a
50   * stream of input data into various units like whitespaces, comments, keywords
51   * etc. These units are the tokens that are reflected in the {@link Token} class
52   * of the <code>de.susebox.jtopas</code> package.
53   *</p><p> 
54   * A <code>Tokenizer</code> is configured using a {@link TokenizerProperties} 
55   * object that contains declarations for whitespaces, separators, comments, 
56   * keywords, special sequences and patterns. It is designed to enable a common 
57   * approach for parsing texts like program code, annotated documents like HTML 
58   * and so on.
59   *</p><p>
60   * To detect links in an HTML document, a tokenizer would be invoked like that
61   * (see {@link StandardTokenizerProperties} and {@link StandardTokenizer} for the
62   * classes mentioned here):
63   *<blockquote><pre>
64   *
65   * Vector               links     = new Vector();
66   * FileReader           reader    = new FileReader("index.html");
67   * TokenizerProperties  props     = new StandardTokenizerProperties();
68   * Tokenizer            tokenizer = new StandardTokenizer();
69   * Token                token;
70   *
71   * props.setParseFlags(TokenizerProperties.F_NO_CASE);
72   * props.setSeparators("=");
73   * props.addString("\"", "\"", "\\");
74   * props.addBlockComment("&gt;", "&lt;");
75   * props.addKeyword("HREF");
76   *
77   * tokenizer.setTokenizerProperties(props);
78   * tokenizer.setSource(new ReaderSource(reader));
79   *
80   * try {
81   *   while (tokenizer.hasMoreToken()) {
82   *     token = tokenizer.nextToken();
83   *     if (token.getType() == Token.KEYWORD) {
84   *       tokenizer.nextToken();               // should be the '=' character
85   *       links.addElement(tokenizer.nextImage());
86   *     }
87   *   }
88   * } finally {
89   *   tokenizer.close();
90   *   reader.close();
91   * }
92   *
93   *</pre></blockquote>
94   * This somewhat rough way to find links should work fine on syntactically
95   * correct HTML code. It finds common links as well as mail, ftp links etc. Note
96   * the block comment. It starts with the "&gt;" character, that is the closing
97   * character for HTML tags and ends with the "&lt;" being the starting character
98   * of HTML tags. The effect is that all the real text is treated as a comment.
99   *</p><p>
100  * To extract the contents of an HTML file, one would write:
101  *<blockquote><pre>
102  *
103  * StringBuffer         contents  = new StringBuffer(4096);
104  * FileReader           reader    = new FileReader("index.html");
105  * TokenizerProperties  props     = new StandardTokenizerProperties();
106  * Tokenizer            tokenizer = new StandardTokenizer();
107  * Token                token;
108  *
109  * props.setParseFlags(TokenizerProperties.F_NO_CASE);
110  * props.addBlockComment("&lt;", "&gt;");
111  * props.addBlockComment("&lt;HEAD&gt;", "&lt;/HEAD&gt;");
112  * props.addBlockComment("&lt;!--", "--&gt;");
113  *    
114  * tokenizer.setTokenizerProperties(props);
115  * tokenizer.setSource(new ReaderSource(reader));
116  *
117  * try {
118  *   while (tokenizer.hasMoreToken()) {
119  *     token = tokenizer.nextToken();
120  *     if (token.getType() != Token.BLOCK_COMMENT) {
121  *       contents.append(token.getToken());
122  *     }
123  *   }
124  * } finally {
125  *   tokenizer.close();
126  *   reader.close();
127  * }
128  *
129  *</pre></blockquote>
130  * Here the block comment is the exact opposite of the first example. Now all the
131  * HTML tags are skipped. Moreover, we declared the HTML-Header as a block
132  * comment as well - the information from the header is thus skipped altogether.
133  *</p><p>
134  * Parsing (tokenizing) is done on a well defined priority scheme. See 
135  * {@link #nextToken} for details.
136  *</p><p>
137  * NOTE: if a character sequence is registered for two categories of tokenizer
138  * properties (e.g. as a line comment starting sequence as well as a special
139  * sequence), the category with the highest priority wins (e.g. if the mentioned
140  * sequence is found, it is interpreted as a line comment). 
141  *</p><p>
142  * The tokenizer interface is clearly designed for "readable" data, say ASCII-
143  * or UNICODE data. Parsing binary data has other characteristics that do not
144  * necessarily fit in a scheme of comments, keywords, strings, identifiers and 
145  * operators.
146  *</p><p>
147  * Note that the interface has no methods that handle stream data sources. This
148  * is left to the implementations that may have quite different data sources, e. g.
149  * {@link java.io.InputStreamReader}, database queries, string arrays etc. The
150  * interface {@link TokenizerSource} serves as an abstraction of such widely 
151  * varying data sources.
152  *</p><p>
153  * The <code>Tokenizer</code> interface partly replaces the older 
154  * {@link de.susebox.java.util.Tokenizer} interface which is deprecated.
155  *</p>
156  *
157  * @see     Token
158  * @see     TokenizerProperties
159  * @author  Heiko Blau
160  */
161 public interface Tokenizer {
162 
163   //---------------------------------------------------------------------------
164   // data source
165   //
166   
167   /**
168    * Setting the source of data. This method is usually called during setup of
169    * the <code>Tokenizer</code> but may also be invoked while the tokenizing
170    * is in progress. It will reset the tokenizer's input buffer, line and column
171    * counters etc.
172    *<br>
173    * It is allowed to pass <code>null</code>. Calls to {@link #hasMoreToken}
174    * will then return <code>false</code>, while calling {@link #nextToken} will
175    * return an EOF token.
176    *
177    * @param source  a {@link TokenizerSource} to read data from or <code>null</code>
178    * @see #getSource
179    */
180   public void setSource(TokenizerSource source);
181   
182   /**
183    * Retrieving the {@link TokenizerSource} of this <code>Tokenizer</code>. The
184    * method may return <code>null</code> if there is no <code>TokenizerSource</code>
185    * associated with this <code>Tokenizer</code>.
186    *
187    * @return  the associated {@link TokenizerSource} or <code>null</code> if there is none
188    * @see #setSource
189    */
190   public TokenizerSource getSource();
191   
192   
193   //---------------------------------------------------------------------------
194   // configuration
195   //
196   
197   /**
198    * Setting the tokenizer characteristics. This operation is usually done before
199    * the parse process. A common place is a constructor of a <code>Tokenizer</code>
200    * implementation. If the tokenizer characteristics change during the parse
201    * process they take effect with the next call of {@link #nextToken} or
202    * {@link #nextImage}. Usually, a <code>Tokenizer</code> implementation will
203    * also implement the {@link TokenizerPropertyListener} interface to be notified
204    * about property changes.
205    *<br>
206    * Generally, the <code>Tokenizer</code> implementation should also implement
207    * the {@link de.susebox.jtopas.spi.DataProvider} interface or provide an inner
208    * class that implements the <code>DataProvider</code> interface, while the
209    * {@link TokenizerProperties} implementation should in turn implement the
210    * interfaces
211    *<ul><li>
212    *    {@link de.susebox.jtopas.spi.WhitespaceHandler},
213    *</li><li>
214    *    {@link de.susebox.jtopas.spi.SeparatorHandler},
215    *</li><li>
216    *    {@link de.susebox.jtopas.spi.SequenceHandler},
217    *</li><li>
218    *    {@link de.susebox.jtopas.spi.KeywordHandler} and
219    *</li><li>
220    *    {@link de.susebox.jtopas.spi.PatternHandler}
221    *</li></ul>
222    * These handler interfaces are collected in the {@link de.susebox.jtopas.spi.DataMapper}
223    * interface.
224    *<br>
225    * Although the implementation of the mentioned interfaces is recommended, it
226    * is not mandatory. The exception is {@link de.susebox.jtopas.spi.PatternHandler}
227    * that must be implemented by the {@link TokenizerProperties} implementation,
228    * since it is not possible for a <code>Tokenizer</code> to interpret a regular
229    * expression pattern only with the information provided through the
230    * <code>TokenizerProperties</code> interface.
231    *<br>
232    * If a <code>Tokenizer</code> implementation chooses to use an exclusively tailored
233    * {@link TokenizerProperties} implementation, it should throw an
234    * {@link java.lang.IllegalArgumentException} if it is not provided with an
235    * instance of that {@link TokenizerProperties} implementation.
236    *<br>
237    * If <code>null</code> is passed to the method it throws
238    * {@link java.lang.NullPointerException}.
239    *
240    * @param   props   the {@link TokenizerProperties} for this tokenizer
241    * @throws  NullPointerException if <code>null</code> is passed to the call
242    * @throws  IllegalArgumentException if the {@link TokenizerProperties} implementation
243    *          of the parameter cannot be used with the implementation of this
244    *          <code>Tokenizer</code>
245    * @see     #getTokenizerProperties
246    */
247   public void setTokenizerProperties(TokenizerProperties props) throws NullPointerException  , IllegalArgumentException  ;
248   
249 
250   /**
251    * Retrieving the current tokenizer characteristics. The method may return
252    * <code>null</code> if {@link #setTokenizerProperties} has not been called so
253    * far.
254    *
255    * @return  the {@link TokenizerProperties} of this <code>Tokenizer</code> or <code>null</code>
256    * @see     #setTokenizerProperties
257    */
258   public TokenizerProperties getTokenizerProperties();
259   
260 
261   /**
262    * Setting the control flags separately for this <code>Tokenizer</code>. Use a
263    * combination of the <code>F_...</code> flags declared in {@link TokenizerProperties}
264    * for the parameter. The <code>mask</code> parameter contains a bit mask of
265    * the <code>F_...</code> flags to change.
266    *<br>
267    * The parse flags for a tokenizer can be set through the associated
268    * {@link TokenizerProperties} instance. These global settings take effect in all
269    * <code>Tokenizer</code> instances that use the same <code>TokenizerProperties</code>
270    * object. Flags related to the parsing process can also be set separately
271    * for each tokenizer during runtime. These are the dynamic flags:
272    *<ul><li>
273    *  {@link TokenizerProperties#F_RETURN_WHITESPACES} and its sub-flags
274    *</li><li>
275    *  {@link TokenizerProperties#F_TOKEN_POS_ONLY}
276    *</li></ul>
277    * Other flags can also be set for each tokenizer separately, but should be set
278    * before the tokenizing starts to make sense.
279    *<ul><li>
280    *  {@link TokenizerProperties#F_KEEP_DATA}
281    *</li><li>
282    *  {@link TokenizerProperties#F_COUNT_LINES}
283    *</li></ul>
284    * The other flags should only be used on the <code>TokenizerProperties</code>
285    * instance or on single {@link TokenizerProperty} objects and influence all
286    * <code>Tokenizer</code> instances sharing the same <code>TokenizerProperties</code>
287    * object. For instance, using the flag {@link TokenizerProperties#F_NO_CASE}
288    * is an invalid operation on a <code>Tokenizer</code>. It affects the interpretation
289    * of keywords and sequences by the associated <code>TokenizerProperties</code>
290    * instance and, moreover, possibly the storage of these properties.
291    *<br>
292    * This method throws a {@link TokenizerException} if a flag is passed that cannot
293    * be handled by the <code>Tokenizer</code> object itself.
294    *<br>
295    * This method takes precedence over the {@link TokenizerProperties#setParseFlags}
296    * method of the associated <code>TokenizerProperties</code> object. Even if
297    * the global settings of one of the dynamic flags (see above) change after a
298    * call to this method, the flags set separately for this tokenizer stay
299    * active.
300    *
301    * @param flags the parser control flags
302    * @param mask  the mask for the flags to set or unset
303    * @throws TokenizerException if one or more of the flags given cannot be honored
304    * @see   #getParseFlags
305    */
306   public void changeParseFlags(int flags, int mask) throws TokenizerException;
307 
308    /**
309     * Retrieving the parser control flags. A bit mask containing the <code>F_...</code>
310     * constants is returned. This method returns both the flags that are set
311     * separately for this <code>Tokenizer</code> and the flags set for the
312     * associated {@link TokenizerProperties} object.
313     *
314     * @return the current parser control flags as a bit mask
315     * @see #changeParseFlags
316     */
317   public int getParseFlags();
318   
319   /**
320    * Setting a new {@link de.susebox.jtopas.spi.KeywordHandler} or removing any
321    * previously installed one. If <code>null</code> is passed (installed handler
322    * removed), no keyword support is available.
323    *<br>
324    * Usually, the {@link TokenizerProperties} used by a <code>Tokenizer</code>
325    * implement the {@link de.susebox.jtopas.spi.KeywordHandler} interface. If so,
326    * the <code>Tokenizer</code> object sets the <code>TokenizerProperties</code>
327    * instance as its <code>KeywordHandler</code>. A different handler or a handler
328    * specific to a certain <code>Tokenizer</code> instance can be set using this method.
329    *
330    * @param handler the (new) {@link de.susebox.jtopas.spi.KeywordHandler} to use
331    *                or <code>null</code> to remove it
332    * @see   #getKeywordHandler
333    * @see   TokenizerProperties#addKeyword
334    */
335   public void setKeywordHandler(de.susebox.jtopas.spi.KeywordHandler handler);
336   
337    /**
338     * Retrieving the current {@link de.susebox.jtopas.spi.KeywordHandler}. The
339     * method may return <code>null</code> if there is no handler installed.
340     *
341     * @return the currently active {@link de.susebox.jtopas.spi.KeywordHandler}
342     *         or <code>null</code>, if keyword support is switched off
343     * @see #setKeywordHandler
344     */
345   public de.susebox.jtopas.spi.KeywordHandler getKeywordHandler();
346   
347   /**
348    * Setting a new {@link de.susebox.jtopas.spi.WhitespaceHandler} or removing
349    * any previously installed one. If <code>null</code> is passed, the tokenizer
350    * will not recognize whitespaces.
351    *<br>
352    * Usually, the {@link TokenizerProperties} used by a <code>Tokenizer</code>
353    * implement the {@link de.susebox.jtopas.spi.WhitespaceHandler} interface. If
354    * so, the <code>Tokenizer</code> object sets the <code>TokenizerProperties</code>
355    * instance as its <code>WhitespaceHandler</code>. A different handler or a
356    * handler specific to a certain <code>Tokenizer</code> instance can be set
357    * using this method.
358    *
359    * @param handler the (new) {@link de.susebox.jtopas.spi.WhitespaceHandler} to
360    *                use or <code>null</code> to switch off whitespace handling
361    * @see   #getWhitespaceHandler
362    * @see   TokenizerProperties#setWhitespaces
363    */
364   public void setWhitespaceHandler(de.susebox.jtopas.spi.WhitespaceHandler handler);
365   
366   /**
367    * Retrieving the current {@link de.susebox.jtopas.spi.WhitespaceHandler}. The
368    * method may return <code>null</code> if whitespaces are not recognized.
369    *
370    * @return  the currently active {@link de.susebox.jtopas.spi.WhitespaceHandler}
371    *          or <code>null</code>, if whitespaces aren't recognized by the tokenizer
372    * @see #setWhitespaceHandler
373    */
374   public de.susebox.jtopas.spi.WhitespaceHandler getWhitespaceHandler();
375   
376   
377   /**
378    * Setting a new {@link de.susebox.jtopas.spi.SeparatorHandler} or removing any
379    * previously installed <code>SeparatorHandler</code>. If <code>null</code> is
380    * passed, the tokenizer doesn't recognize separators.
381    *<br>
382    * Usually, the {@link TokenizerProperties} used by a <code>Tokenizer</code>
383    * implement the {@link de.susebox.jtopas.spi.SeparatorHandler} interface. If
384    * so, the <code>Tokenizer</code> object sets the <code>TokenizerProperties</code>
385    * instance as its <code>SeparatorHandler</code>. A different handler or a
386    * handler specific to a certain <code>Tokenizer</code> instance can be set
387    * using this method.
388    *
389    * @param handler the (new) {@link de.susebox.jtopas.spi.SeparatorHandler} to
390    *                use or <code>null</code> to remove it
391    * @see   #getSeparatorHandler
392    * @see   TokenizerProperties#setSeparators
393    */
394   public void setSeparatorHandler(de.susebox.jtopas.spi.SeparatorHandler handler);
395   
396   /**
397    * Retrieving the current {@link de.susebox.jtopas.spi.SeparatorHandler}. The
398    * method may return <code>null</code> if there is no handler installed.
399    *
400    * @return  the currently active {@link de.susebox.jtopas.spi.SeparatorHandler}
401    *          or <code>null</code>, if separators aren't recognized by the tokenizer
402    * @see     #setSeparatorHandler
403    */
404   public de.susebox.jtopas.spi.SeparatorHandler getSeparatorHandler();
405   
406   
407   /**
408    * Setting a new {@link de.susebox.jtopas.spi.SequenceHandler} or removing any
409    * previously installed one. If <code>null</code> is passed, the tokenizer will
410    * not recognize line and block comments, strings and special sequences.
411    *<br>
412    * Usually, the {@link TokenizerProperties} used by a <code>Tokenizer</code>
413    * implement the {@link de.susebox.jtopas.spi.SequenceHandler} interface. If
414    * so, the <code>Tokenizer</code> object sets the <code>TokenizerProperties</code>
415    * instance as its <code>SequenceHandler</code>. A different handler or a
416    * handler specific to a certain <code>Tokenizer</code> instance can be set
417    * using this method.
418    *
419    * @param handler the (new) {@link de.susebox.jtopas.spi.SequenceHandler} to
420    *                use or <code>null</code> to remove it
421    * @see #getSequenceHandler
422    * @see TokenizerProperties#addSpecialSequence
423    * @see TokenizerProperties#addLineComment
424    * @see TokenizerProperties#addBlockComment
425    * @see TokenizerProperties#addString
426    */
427   public void setSequenceHandler(de.susebox.jtopas.spi.SequenceHandler handler);
428   
429   /**
430    * Retrieving the current {@link de.susebox.jtopas.spi.SequenceHandler}. The
431    * method may return <code>null</code> if there is no handler installed.
432    *<br>
433    * A <code>SequenceHandler</code> deals with line and block comments, strings
434    * and special sequences.
435    *
436    * @return  the currently active {@link de.susebox.jtopas.spi.SequenceHandler}
437    *          or <code>null</code>, if no handler is installed
438    * @see #setSequenceHandler
439    */
440   public de.susebox.jtopas.spi.SequenceHandler getSequenceHandler();
441   
442   
443   /**
444    * Setting a new {@link de.susebox.jtopas.spi.PatternHandler} or removing any
445    * previously installed one. If <code>null</code> is passed, patterns are not
446    * supported by the tokenizer (any longer).
447    *<br>
448    * Usually, the {@link TokenizerProperties} used by a <code>Tokenizer</code>
449    * implement the {@link de.susebox.jtopas.spi.PatternHandler} interface. If
450    * so, the <code>Tokenizer</code> object sets the <code>TokenizerProperties</code>
451    * instance as its <code>PatternHandler</code>. A different handler or a
452    * handler specific to a certain <code>Tokenizer</code> instance can be set
453    * using this method.
454    *
455    * @param handler the (new) {@link de.susebox.jtopas.spi.PatternHandler} to
456    *                use or <code>null</code> to remove it
457    * @see #getPatternHandler
458    * @see TokenizerProperties#addPattern
459    */
460   public void setPatternHandler(de.susebox.jtopas.spi.PatternHandler handler);
461   
462   /**
463    * Retrieving the current {@link de.susebox.jtopas.spi.PatternHandler}. The method
464    * may return <code>null</code> if there is no handler installed.
465    *
466    * @return  the currently active {@link de.susebox.jtopas.spi.PatternHandler}
467    *          or <code>null</code>, if patterns are not recognized by the tokenizer
468    * @see #setPatternHandler
469    */
470   public de.susebox.jtopas.spi.PatternHandler getPatternHandler();
471   
472 
473   //---------------------------------------------------------------------------
474   // tokenizer operations
475   //
476 
477   /**
478    * Check if there are more tokens available. This method will return
479    * <code>true</code> until an end-of-file condition is encountered during a
480    * call to {@link #nextToken} or {@link #nextImage}.
481    *<br>
482    * That means that the EOF is returned one time, afterwards <code>hasMoreToken</code>
483    * will return <code>false</code>. Furthermore, that implies that the method
484    * will return <code>true</code> at least once, even if the input data stream
485    * is empty.
486    *<br>
487    * The method can be conveniently used in a while loop.
488    *
489    * @return  <code>true</code> if a call to {@link #nextToken} or {@link #nextImage}
490    *          will succeed, <code>false</code> otherwise
491    */
492   public boolean hasMoreToken();
493   
494   /**
495    * Retrieving the next {@link Token}. The method works in this order:
496    *<ol><li>
497    *   Check for an end-of-file condition. If there is such a condition then
498    *   return it.
499    *</li><li>
500    *   Try to collect a sequence of whitespaces. If such a sequence can be found
501    *   return it if the flag <code>F_RETURN_WHITESPACES</code> is set, or skip
502    *   these whitespaces.
503    *</li><li>
504    *   Check the next characters against all known patterns. A pattern is usually
505    *   a regular expression that is used by {@link java.util.regex.Pattern}. But
506    *   implementations of {@link de.susebox.jtopas.spi.PatternHandler} may use
507    *   other pattern syntaxes. Note that patterns are not recognized within
508    *   "normal" text (see below for a more precise description).
509    *</li><li>
510    *   Check the next characters against all known line and block comments. If
511    *   a line or block comment starting sequence matches, return the comment if
512    *   the flag <code>F_RETURN_WHITESPACES</code> is set, or skip the comment.
513    *   If comments are returned they include their starting and ending sequences
514    *   (newline in case of a line comment).
515    *</li><li>
516    *   Check the next characters against all known string starting sequences. If
517    *   a string begin could be identified return the string until and including
518    *   the closing sequence.
519    *</li><li>
520    *   Check the next characters against all known special sequences. Especially,
521    *   find the longest possible match. If a special sequence could be identified
522    *   then return it.
523    *</li><li>
524    *   Check for ordinary separators. If one could be found return it.
525    *</li><li>
526    *   Check the next characters against all known keywords. If a keyword could
527    *   be identified then return it.
528    *</li><li>
529    *   Return the text portion until the next whitespace, comment, special
530    *   sequence or separator. Note that patterns are not recognized within "normal"
531    *   text. A pattern match has therefore always a whitespace, comment, special
532    *   sequence, separator or another pattern match in front of it or starts at
533    *   position 0 of the data.
534    *</li></ol>
535    * The method will return the EOF token as long as {@link #hasMoreToken} returns
536    * <code>false</code>. It will not return <code>null</code> in such conditions.
537    *
538    * @return the {@link Token} found, including the EOF token
539    * @throws TokenizerException generic exception (list) for all problems that may occur while parsing
540    * (IOExceptions for instance)
541    * @see   #nextImage
542    */
543   public Token nextToken() throws TokenizerException;
544  
545   /**
546    * This method is a convenience method. It returns only the next token image
547    * without any information about its type or associated information. This is
548    * an especially useful method, if the parse flags for this <code>Tokenizer</code>
549    * have the flag {@link TokenizerProperties#F_TOKEN_POS_ONLY} set, since this
550    * method returns a valid string even in that case.
551    *
552    * @return the token image of the next token
553    * @throws TokenizerException generic exception (list) for all problems that may occur while parsing
554    * (IOExceptions for instance)
555    * @see   #nextToken
556    * @see   #currentImage
557    */
558   public String   nextImage() throws TokenizerException;
559  
560   /**
561    * Retrieve the {@link Token} that was found by the last call to {@link #nextToken}
562    * or {@link #nextImage}.
563    *<br>
564    * Since version 0.6.1 of JTopas, this method throws a {@link TokenizerException}
565    * rather than returning <code>null</code> if neither {@link #nextToken} nor
566    * {@link #nextImage} have been called before or {@link #setReadPositionRelative}
567    * or {@link #setReadPositionAbsolute} have been called after the last call to
568    * <code>nextToken</code> or <code>nextImage</code>.
569    *
570    * @return  the {@link Token} retrieved by the last call to {@link #nextToken}
571    * @throws  TokenizerException if the tokenizer has no current token
572    * @see     #nextToken
573    * @see     #currentImage
574    */
575   public Token currentToken() throws TokenizerException;
576  
577   /**
578    * Convenience method to retrieve only the token image of the {@link Token} that
579    * would be returned by {@link #currentToken}. This is an especially useful
580    * method, if the parse flags for this <code>Tokenizer</code> have the
581    * flag {@link TokenizerProperties#F_TOKEN_POS_ONLY} set, since this method
582    * returns a valid string even in that case.
583    *<br>
584    * Since version 0.6.1 of JTopas, this method throws a {@link TokenizerException}
585    * rather than returning <code>null</code> if neither {@link #nextToken} nor
586    * {@link #nextImage} have been called before or {@link #setReadPositionRelative}
587    * or {@link #setReadPositionAbsolute} have been called after the last call to
588    * <code>nextToken</code> or <code>nextImage</code>.
589    *
590    * @return  the token image of the current token
591    * @throws  TokenizerException if the tokenizer has no current token
592    * @see     #currentToken
593    * @see     #nextImage
594    */
595   public String   currentImage() throws TokenizerException;
596 
597   
598   //---------------------------------------------------------------------------
599   // line and column positions
600   //
601   
602   /**
603    * If the flag {@link TokenizerProperties#F_COUNT_LINES} is set, this method
604    * will return the line number starting with 0 in the input stream. The
605    * implementation of the <code>Tokenizer</code> interface can decide which
606    * end-of-line sequences should be recognized. The most flexible approach is
607    * to process the following end-of-line sequences:
608    * <br><ul><li>
609    * Carriage Return (ASCII 13, '\r'). This EOL is used on Apple Macintosh.
610    * </li><li>
611    * Linefeed (ASCII 10, '\n'). This is the UNIX EOL character.
612    * </li><li>
613    * Carriage Return + Linefeed ("\r\n"). This is used on MS Windows systems.
614    * </li></ul>
615    * Another legitimate and in many cases satisfying way is to use the system
616    * property "line.separator".
617    *<br>
618    * Displaying information about lines usually means adding 1 to the zero-based
619    * line number.
620    *
621    * @return  the current line number starting with 0 or -1 if no line numbers
622    *          are supplied ({@link TokenizerProperties#F_COUNT_LINES} is not set).
623    * @see #getColumnNumber
624    */  
625   public int getLineNumber();
626   
627   /**
628    * If the flag {@link TokenizerProperties#F_COUNT_LINES} is set, this method
629    * will return the current column position starting with 0 in the input stream.
630    * Displaying information about columns usually means adding 1 to the zero-based
631    * column number.
632    *
633    * @return  the current column position starting with 0 or -1 if no column
634    *          numbers are supplied ({@link TokenizerProperties#F_COUNT_LINES}
635    *          is not set).
636    * @see     #getLineNumber
637    */  
638   public int getColumnNumber();
639   
640   
641   //---------------------------------------------------------------------------
642   // text range operations
643   //
644   
645   /**
646    * This method returns the absolute offset, in characters from the start of the
647    * parsed stream, of the text range that is currently available. Together with
648    * {@link #currentlyAvailable} it describes the currently available text "window".
649    *<br>
650    * The positions returned by this method and also by {@link #getReadPosition}
651    * are absolute rather than relative in a text buffer to give the tokenizer
652    * the full control of how and when to refill its text buffer.
653    *
654    * @return the absolute offset of the current text window in characters from
655    *         the start of the data source of the Tokenizer
656    */
657   public int getRangeStart();
658   
659   /**
660    * Getting the current read offset. This is the absolute position where the
661    * next call to {@link #nextToken} or {@link #nextImage} will start. It is
662    * therefore <b><i>not</i></b> the same as the position returned by
663    * {@link Token#getStartPosition} of the current token ({@link #currentToken}).
664    *<br>
665    * It is the starting position of the token returned by the next call to
666    * {@link #nextToken}, if that token is no whitespace or if whitespaces are
667    * returned ({@link TokenizerProperties#F_RETURN_WHITESPACES}).
668    *<br>
669    * The positions returned by this method and also by {@link #getRangeStart}
670    * are absolute rather than relative in a text buffer to give the tokenizer
671    * the full control of how and when to refill its text buffer.
672    *
673    * @return the absolute offset in characters from the start of the data source
674    *         of the Tokenizer where reading will be continued
675    */
676   public int getReadPosition();
677   
678   /**
679    * Returns the number of currently available characters. This includes
680    * both characters already parsed by the <code>Tokenizer</code> and characters
681    * still to be analyzed.
682    *
683    * @return number of currently available characters
684    */
685   public int currentlyAvailable();
686   
687   /**
688    * Retrieve text from the currently available range. The range described by the
689    * start and length parameters must lie between {@link #getRangeStart} and
690    * {@link #getRangeStart} + {@link #currentlyAvailable}.
691    *<br>
692    * Example:
693    *<blockquote><pre>
694    *    int     startPos = tokenizer.getReadPosition();
695    *    String  source;
696    *
697    *    while (tokenizer.hasMoreToken()) {
698    *      Token token = tokenizer.nextToken();
699    *      
700    *      switch (token.getType()) {
701    *      case Token.LINE_COMMENT:
702    *      case Token.BLOCK_COMMENT:
703    *        source   = tokenizer.getText(startPos, token.getStartPosition() - startPos);
704    *        startPos = token.getStartPosition();
705    *      }
706    *    }
707    *</pre></blockquote>
708    *
709    * @param   start   position where the text begins
710    * @param   length  length of the text
711    * @return  the text beginning at the given position with the given length
712    * @throws  IndexOutOfBoundsException if the starting position or the length is 
713    *          out of the current text window
714    */
715   public String   getText(int start, int length) throws IndexOutOfBoundsException  ;
716   
717   /**
718    * Get a single character from the current text range (see {@link #getRangeStart}).
719    *
720    * @param pos position of the required character
721    * @return the character at the specified position
722    * @throws IndexOutOfBoundsException if the parameter <code>pos</code> is not 
723    *         in the available text range (text window)
724    */
725   public char getChar(int pos) throws IndexOutOfBoundsException  ;
726   
727   /**
728    * Try to read more data into the text buffer of the tokenizer. This can be
729    * useful when a method needs to look ahead of the available data or a skip
730    * operation should be performed.
731    *<br>
732    * The method returns the same value as an immediately following call to 
733    * {@link #currentlyAvailable} would return.
734    *
735    * @return  the number of characters now available
736    * @throws  TokenizerException generic exception (list) for all problems that 
737    *          may occur while reading (IOExceptions for instance)
738    */
739   public int readMore() throws TokenizerException;
740   
741   /**
742    * This method sets the tokenizer's current read position to the given absolute 
743    * read position. It realizes one type of rewind / forward operations. The 
744    * given position must be inside the interval between {@link #getRangeStart} and 
745    * {@link #getRangeStart} + {@link #currentlyAvailable} - 1.
746    *<br>
747    * The current read position is the end position of the current token. That means
748    * that the following assertion can be made:
749    *<pre>
750    *    Token token1 = tokenizer.nextToken();
751    *    tokenizer.setReadPositionAbsolute(tokenizer.getReadPosition() - token1.getLength());
752    *    Token token2 = tokenizer.nextToken();
753    *    assert(token1.equals(token2));
754    *</pre>
755    *<br>
756    * Since JTopas version 0.6.1, the operation clears the current token. Therefore,
757    * {@link #currentImage} and {@link #currentToken} will throw a {@link TokenizerException}
758    * if called after a <code>setReadPositionAbsolute</code> without a subsequent
759    * call to {@link #nextToken} or {@link #nextImage}.
760    *
761    * @param   position  absolute position for the next parse operation
762    * @throws  IndexOutOfBoundsException if the parameter <code>position</code> is 
763    *          not in the available text range (text window)
764    * @see     #setReadPositionRelative
765    */
766   public void setReadPositionAbsolute(int position) throws IndexOutOfBoundsException  ;
767   
768   /**
769    * This method moves the tokenizer's read position the given number of characters
770    * forward (positive value) or backward (negative value) starting from the current
771    * read position. It realizes one type of rewind / forward operations. The given
772    * offset must be greater than or equal to {@link #getRangeStart} - {@link #getReadPosition}
773    * and lower than {@link #getRangeStart} + {@link #currentlyAvailable} - {@link #getReadPosition}.
774    *<br>
775    * Since JTopas version 0.6.1, the operation clears the current token. Therefore,
776    * {@link #currentImage} and {@link #currentToken} will throw a {@link TokenizerException}
777    * if called after a <code>setReadPositionRelative</code> without a subsequent
778    * call to {@link #nextToken} or {@link #nextImage}.
779    *
780    * @param   offset  number of characters to move forward (positive offset) or
781    *                  backward (negative offset)
782    * @throws  IndexOutOfBoundsException if the parameter <code>offset</code> would
783    *          move the read position out of the available text range (text window)
784    * @see     #setReadPositionAbsolute
785    */
786   public void setReadPositionRelative(int offset) throws IndexOutOfBoundsException  ;
787 
788 
789   //---------------------------------------------------------------------------
790   // Cleanup
791   //
792   
793   /**
794    * This method is necessary to release memory and remove object references if
795    * <code>Tokenizer</code> instances are frequently created for small tasks.
796    * Generally, the method shouldn't throw any exceptions. It is also ok to call
797    * it more than once.
798    *<br>
799    * It is an error to call any other method of the implementing class after
800    * <code>close</code> has been called.
801    */
802   public void close();
803 }
804
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags