KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > de > susebox > jtopas > Tokenizer


1 /*
2  * Tokenizer.java: lexical parser interface.
3  *
4  * Copyright (C) 2001 Heiko Blau
5  *
6  * This file belongs to the JTopas Library.
7  * JTopas is free software; you can redistribute it and/or modify it
8  * under the terms of the GNU Lesser General Public License as published by the
9  * Free Software Foundation; either version 2.1 of the License, or (at your
10  * option) any later version.
11  *
12  * This software is distributed in the hope that it will be useful, but WITHOUT
13  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14  * FITNESS FOR A PARTICULAR PURPOSE.
15  * See the GNU Lesser General Public License for more details.
16  *
17  * You should have received a copy of the GNU Lesser General Public License along
18  * with JTopas. If not, write to the
19  *
20  * Free Software Foundation, Inc.
21  * 59 Temple Place, Suite 330,
22  * Boston, MA 02111-1307
23  * USA
24  *
25  * or check the Internet: http://www.fsf.org
26  *
27  * Contact:
28  * email: heiko@susebox.de
29  */

30
31 package de.susebox.jtopas;
32
33 //-----------------------------------------------------------------------------
34
// Imports
35
//
36
import de.susebox.jtopas.spi.WhitespaceHandler;
37 import de.susebox.jtopas.spi.SeparatorHandler;
38 import de.susebox.jtopas.spi.KeywordHandler;
39 import de.susebox.jtopas.spi.SequenceHandler;
40 import de.susebox.jtopas.spi.PatternHandler;
41
42
43 //-----------------------------------------------------------------------------
44
// Interface Tokenizer
45
//
46

47 /**<p>
48  * The interface <code>Tokenizer</code> contains setup methods, parse operations
49  * and other getter and setter methods for a tokenizer. A tokenizer splits a
50  * stream of input data into various units like whitespaces, comments, keywords
51  * etc. These units are the tokens that are reflected in the {@link Token} class
52  * of the <code>de.susebox.jtopas</code> package.
53  *</p><p>
54  * A <code>Tokenizer</code> is configured using a {@link TokenizerProperties}
55  * object that contains declarations for whitespaces, separators, comments,
56  * keywords, special sequences and patterns. It is designed to enable a common
57  * approach for parsing texts like program code, annotated documents like HTML
58  * and so on.
59  *</p><p>
60  * To detect links in an HTML document, a tokenizer would be invoked like this
61  * (see {@link StandardTokenizerProperties} and {@link StandardTokenizer} for the
62  * classes mentioned here):
63  *<blockquote><pre>
64  *
65  * Vector links = new Vector();
66  * FileReader reader = new FileReader("index.html");
67  * TokenizerProperties props = new StandardTokenizerProperties();
68  * Tokenizer tokenizer = new StandardTokenizer();
69  * Token token;
70  *
71  * props.setParseFlags(TokenizerProperties.F_NO_CASE);
72  * props.setSeparators("=");
73  * props.addString("\"", "\"", "\\");
74  * props.addBlockComment("&gt;", "&lt;");
75  * props.addKeyword("HREF");
76  *
77  * tokenizer.setTokenizerProperties(props);
78  * tokenizer.setSource(new ReaderSource(reader));
79  *
80  * try {
81  * while (tokenizer.hasMoreToken()) {
82  * token = tokenizer.nextToken();
83  * if (token.getType() == Token.KEYWORD) {
84  * tokenizer.nextToken(); // should be the '=' character
85  * links.addElement(tokenizer.nextImage());
86  * }
87  * }
88  * } finally {
89  * tokenizer.close();
90  * reader.close();
91  * }
92  *
93  *</pre></blockquote>
94  * This somewhat rough way to find links should work fine on syntactically
95  * correct HTML code. It finds common links as well as mail, ftp links etc. Note
96  * the block comment. It starts with the "&gt;" character, that is the closing
97  * character for HTML tags and ends with the "&lt;" being the starting character
98  * of HTML tags. The effect is that all the real text is treated as a comment.
99  *</p><p>
100  * To extract the contents of a HTML file, one would write:
101  *<blockquote><pre>
102  *
103  * StringBuffer contents = new StringBuffer(4096);
104  * FileReader reader = new FileReader("index.html");
105  * TokenizerProperties props = new StandardTokenizerProperties();
106  * Tokenizer tokenizer = new StandardTokenizer();
107  * Token token;
108  *
109  * props.setParseFlags(TokenizerProperties.F_NO_CASE);
110  * props.addBlockComment("&lt;", "&gt;");
111  * props.addBlockComment("&lt;HEAD&gt;", "&lt;/HEAD&gt;");
112  * props.addBlockComment("&lt;!--", "--&gt;");
113  *
114  * tokenizer.setTokenizerProperties(props);
115  * tokenizer.setSource(new ReaderSource(reader));
116  *
117  * try {
118  * while (tokenizer.hasMoreToken()) {
119  * token = tokenizer.nextToken();
120  * if (token.getType() != Token.BLOCK_COMMENT) {
121  * contents.append(token.getToken());
122  * }
123  * }
124  * } finally {
125  * tokenizer.close();
126  * reader.close();
127  * }
128  *
129  *</pre></blockquote>
130  * Here the block comment is the exact opposite of the first example. Now all the
131  * HTML tags are skipped. Moreover, we declared the HTML-Header as a block
132  * comment as well - the information from the header is thus skipped altogether.
133  *</p><p>
134  * Parsing (tokenizing) is done on a well defined priority scheme. See
135  * {@link #nextToken} for details.
136  *</p><p>
137  * NOTE: if a character sequence is registered for two categories of tokenizer
138  * properties (e.g. as a line comments starting sequence as well as a special
139  * sequence), the category with the highest priority wins (e.g. if the mentioned
140  * sequence is found, it is interpreted as a line comment).
141  *</p><p>
142  * The tokenizer interface is clearly designed for "readable" data, say ASCII-
143  * or UNICODE data. Parsing binary data has other characteristics that do not
144  * necessarily fit in a scheme of comments, keywords, strings, identifiers and
145  * operators.
146  *</p><p>
147  * Note that the interface has no methods that handle stream data sources. This
148  * is left to the implementations that may have quite different data sources, e. g.
149  * {@link java.io.InputStreamReader}, database queries, string arrays etc. The
150  * interface {@link TokenizerSource} serves as an abstraction of such widely
151  * varying data sources.
152  *</p><p>
153  * The <code>Tokenizer</code> interface partly replaces the older
154  * {@link de.susebox.java.util.Tokenizer} interface which is deprecated.
155  *</p>
156  *
157  * @see Token
158  * @see TokenizerProperties
159  * @author Heiko Blau
160  */

161 public interface Tokenizer {
162
163   //---------------------------------------------------------------------------
164
// data source
165
//
166

167   /**
168    * Sets the source of data. This method is usually called during setup of
169    * the <code>Tokenizer</code> but may also be invoked while the tokenizing
170    * is in progress. It will reset the tokenizer's input buffer, line and column
171    * counters etc.
172    *<br>
173    * It is allowed to pass <code>null</code>. In that case, calls to
174    * {@link #hasMoreToken} will return <code>false</code>, while calling
175    * {@link #nextToken} will return an EOF token.
176    *
177    * @param source a {@link TokenizerSource} to read data from, may be <code>null</code>
178    * @see #getSource
179    */

180   public void setSource(TokenizerSource source);
181   
182   /**
183    * Retrieves the {@link TokenizerSource} of this <code>Tokenizer</code>. The
184    * method may return <code>null</code> if there is no <code>TokenizerSource</code>
185    * associated with this <code>Tokenizer</code>.
186    *
187    * @return the associated {@link TokenizerSource} or <code>null</code> if there is none
188    * @see #setSource
189    */

190   public TokenizerSource getSource();
191   
192   
193   //---------------------------------------------------------------------------
194
// configuration
195
//
196

197   /**
198    * Sets the tokenizer characteristics. This operation is usually done before
199    * the parse process. A common place is a constructor of a <code>Tokenizer</code>
200    * implementation. If the tokenizer characteristics change during the parse
201    * process they take effect with the next call of {@link #nextToken} or
202    * {@link #nextImage}. Usually, a <code>Tokenizer</code> implementation will
203    * also implement the {@link TokenizerPropertyListener} interface to be notified
204    * about property changes.
205    *<br>
206    * Generally, the <code>Tokenizer</code> implementation should also implement
207    * the {@link de.susebox.jtopas.spi.DataProvider} interface or provide an inner
208    * class that implements the <code>DataProvider</code> interface, while the
209    * {@link TokenizerProperties} implementation should in turn implement the
210    * interfaces
211    *<ul><li>
212    * {@link de.susebox.jtopas.spi.WhitespaceHandler},
213    *</li><li>
214    * {@link de.susebox.jtopas.spi.SeparatorHandler},
215    *</li><li>
216    * {@link de.susebox.jtopas.spi.SequenceHandler},
217    *</li><li>
218    * {@link de.susebox.jtopas.spi.KeywordHandler} and
219    *</li><li>
220    * {@link de.susebox.jtopas.spi.PatternHandler}
221    *</li></ul>
222    * These handler interfaces are collected in the {@link de.susebox.jtopas.spi.DataMapper}
223    * interface.
224    *<br>
225    * Although the implementation of the mentioned interfaces is recommended, it
226    * is not mandatory. The exception is {@link de.susebox.jtopas.spi.PatternHandler},
227    * which must be implemented by the {@link TokenizerProperties} implementation,
228    * since it is not possible for a <code>Tokenizer</code> to interpret a regular
229    * expression pattern only with the information provided through the
230    * <code>TokenizerProperties</code> interface.
231    *<br>
232    * If a <code>Tokenizer</code> implementation chooses to use an exclusively tailored
233    * {@link TokenizerProperties} implementation, it should throw an
234    * {@link java.lang.IllegalArgumentException} if it is not provided with an
235    * instance of that {@link TokenizerProperties} implementation.
236    *<br>
237    * If <code>null</code> is passed to the method it throws
238    * {@link java.lang.NullPointerException}.
239    *
240    * @param props the {@link TokenizerProperties} for this tokenizer
241    * @throws NullPointerException if <code>null</code> is passed to the call
242    * @throws IllegalArgumentException if the {@link TokenizerProperties} implementation
243    * of the parameter cannot be used with the implementation of this
244    * <code>Tokenizer</code>
245    * @see #getTokenizerProperties
246    */

247   public void setTokenizerProperties(TokenizerProperties props) throws NullPointerException, IllegalArgumentException;
248   
249
250   /**
251    * Retrieves the current tokenizer characteristics. The method may return
252    * <code>null</code> if {@link #setTokenizerProperties} has not been called so
253    * far.
254    *
255    * @return the {@link TokenizerProperties} of this <code>Tokenizer</code>
256    * @see #setTokenizerProperties
257    */

258   public TokenizerProperties getTokenizerProperties();
259   
260
261   /**
262    * Changes the parse control flags of this <code>Tokenizer</code>. Use a
263    * combination of the <code>F_...</code> flags declared in {@link TokenizerProperties}
264    * for the parameter. The <code>mask</code> parameter contains a bit mask of
265    * the <code>F_...</code> flags to change.
266    *<br>
267    * The parse flags for a tokenizer can be set through the associated
268    * {@link TokenizerProperties} instance. These global settings take effect in all
269    * <code>Tokenizer</code> instances that use the same <code>TokenizerProperties</code>
270    * object. Flags related to the parsing process can also be set separately
271    * for each tokenizer during runtime. These are the dynamic flags:
272    *<ul><li>
273    * {@link TokenizerProperties#F_RETURN_WHITESPACES} and its sub-flags
274    *</li><li>
275    * {@link TokenizerProperties#F_TOKEN_POS_ONLY}
276    *</li></ul>
277    * Other flags can also be set for each tokenizer separately, but should be set
278    * before the tokenizing starts to make sense.
279    *<ul><li>
280    * {@link TokenizerProperties#F_KEEP_DATA}
281    *</li><li>
282    * {@link TokenizerProperties#F_COUNT_LINES}
283    *</li></ul>
284    * The other flags should only be used on the <code>TokenizerProperties</code>
285    * instance or on single {@link TokenizerProperty} objects and influence all
286    * <code>Tokenizer</code> instances sharing the same <code>TokenizerProperties</code>
287    * object. For instance, using the flag {@link TokenizerProperties#F_NO_CASE}
288    * is an invalid operation on a <code>Tokenizer</code>. It affects the interpretation
289    * of keywords and sequences by the associated <code>TokenizerProperties</code>
290    * instance and, moreover, possibly the storage of these properties.
291    *<br>
292    * This method throws a {@link TokenizerException} if a flag is passed that cannot
293    * be handled by the <code>Tokenizer</code> object itself.
294    *<br>
295    * This method takes precedence over the {@link TokenizerProperties#setParseFlags}
296    * method of the associated <code>TokenizerProperties</code> object. Even if
297    * the global settings of one of the dynamic flags (see above) change after a
298    * call to this method, the flags set separately for this tokenizer stay
299    * active.
300    *
301    * @param flags the parser control flags
302    * @param mask the mask for the flags to set or unset
303    * @throws TokenizerException if one or more of the flags given cannot be honored
304    * @see #getParseFlags
305    */

306   public void changeParseFlags(int flags, int mask) throws TokenizerException;
307
308    /**
309     * Retrieves the parser control flags. A bitmask containing the <code>F_...</code>
310     * constants is returned. This method returns both the flags that are set
311     * separately for this <code>Tokenizer</code> and the flags set for the
312     * associated {@link TokenizerProperties} object.
313     *
314     * @return the current parser control flags
315     * @see #changeParseFlags
316     */

317   public int getParseFlags();
318   
319   /**
320    * Sets a new {@link de.susebox.jtopas.spi.KeywordHandler} or removes any
321    * previously installed one. If <code>null</code> is passed (installed handler
322    * removed), no keyword support is available.
323    *<br>
324    * Usually, the {@link TokenizerProperties} used by a <code>Tokenizer</code>
325    * implement the {@link de.susebox.jtopas.spi.KeywordHandler} interface. If so,
326    * the <code>Tokenizer</code> object sets the <code>TokenizerProperties</code>
327    * instance as its <code>KeywordHandler</code>. A different handler or a handler
328    * specific to a certain <code>Tokenizer</code> instance can be set using this method.
329    *
330    * @param handler the (new) {@link de.susebox.jtopas.spi.KeywordHandler} to use
331    * or <code>null</code> to remove it
332    * @see #getKeywordHandler
333    * @see TokenizerProperties#addKeyword
334    */

335   public void setKeywordHandler(de.susebox.jtopas.spi.KeywordHandler handler);
336   
337    /**
338     * Retrieves the current {@link de.susebox.jtopas.spi.KeywordHandler}. The
339     * method may return <code>null</code> if there isn't any handler installed.
340     *
341     * @return the currently active {@link de.susebox.jtopas.spi.KeywordHandler}
342     * or <code>null</code>, if keyword support is switched off
343     * @see #setKeywordHandler
344     */

345   public de.susebox.jtopas.spi.KeywordHandler getKeywordHandler();
346   
347   /**
348    * Sets a new {@link de.susebox.jtopas.spi.WhitespaceHandler} or removes
349    * any previously installed one. If <code>null</code> is passed, the tokenizer
350    * will not recognize whitespaces.
351    *<br>
352    * Usually, the {@link TokenizerProperties} used by a <code>Tokenizer</code>
353    * implement the {@link de.susebox.jtopas.spi.WhitespaceHandler} interface. If
354    * so, the <code>Tokenizer</code> object sets the <code>TokenizerProperties</code>
355    * instance as its <code>WhitespaceHandler</code>. A different handler or a
356    * handler specific to a certain <code>Tokenizer</code> instance can be set
357    * using this method.
358    *
359    * @param handler the (new) whitespace handler to use or <code>null</code> to
360    * switch off whitespace handling
361    * @see #getWhitespaceHandler
362    * @see TokenizerProperties#setWhitespaces
363    */

364   public void setWhitespaceHandler(de.susebox.jtopas.spi.WhitespaceHandler handler);
365   
366   /**
367    * Retrieves the current {@link de.susebox.jtopas.spi.WhitespaceHandler}. The
368    * method may return <code>null</code> if whitespaces are not recognized.
369    *
370    * @return the currently active whitespace handler or <code>null</code>, if no
371    * handler is installed (whitespaces are then not recognized)
372    * @see #setWhitespaceHandler
373    */

374   public de.susebox.jtopas.spi.WhitespaceHandler getWhitespaceHandler();
375   
376   
377   /**
378    * Sets a new {@link de.susebox.jtopas.spi.SeparatorHandler} or removes any
379    * previously installed <code>SeparatorHandler</code>. If <code>null</code> is
380    * passed, the tokenizer doesn't recognize separators.
381    *<br>
382    * Usually, the {@link TokenizerProperties} used by a <code>Tokenizer</code>
383    * implement the {@link de.susebox.jtopas.spi.SeparatorHandler} interface. If
384    * so, the <code>Tokenizer</code> object sets the <code>TokenizerProperties</code>
385    * instance as its <code>SeparatorHandler</code>. A different handler or a
386    * handler specific to a certain <code>Tokenizer</code> instance can be set
387    * using this method.
388    *
389    * @param handler the (new) separator handler to use or <code>null</code> to
390    * remove it
391    * @see #getSeparatorHandler
392    * @see TokenizerProperties#setSeparators
393    */

394   public void setSeparatorHandler(de.susebox.jtopas.spi.SeparatorHandler handler);
395   
396   /**
397    * Retrieves the current {@link de.susebox.jtopas.spi.SeparatorHandler}. The
398    * method may return <code>null</code> if there isn't any handler installed.
399    *
400    * @return the currently active {@link de.susebox.jtopas.spi.SeparatorHandler}
401    * or <code>null</code>, if separators aren't recognized by the tokenizer
402    * @see #setSeparatorHandler
403    */

404   public de.susebox.jtopas.spi.SeparatorHandler getSeparatorHandler();
405   
406   
407   /**
408    * Sets a new {@link de.susebox.jtopas.spi.SequenceHandler} or removes any
409    * previously installed one. If <code>null</code> is passed, the tokenizer will
410    * not recognize line and block comments, strings and special sequences.
411    *<br>
412    * Usually, the {@link TokenizerProperties} used by a <code>Tokenizer</code>
413    * implement the {@link de.susebox.jtopas.spi.SequenceHandler} interface. If
414    * so, the <code>Tokenizer</code> object sets the <code>TokenizerProperties</code>
415    * instance as its <code>SequenceHandler</code>. A different handler or a
416    * handler specific to a certain <code>Tokenizer</code> instance can be set
417    * using this method.
418    *
419    * @param handler the (new) {@link de.susebox.jtopas.spi.SequenceHandler} to
420    * use or <code>null</code> to remove it
421    * @see #getSequenceHandler
422    * @see TokenizerProperties#addSpecialSequence
423    * @see TokenizerProperties#addLineComment
424    * @see TokenizerProperties#addBlockComment
425    * @see TokenizerProperties#addString
426    */

427   public void setSequenceHandler(de.susebox.jtopas.spi.SequenceHandler handler);
428   
429   /**
430    * Retrieves the current {@link de.susebox.jtopas.spi.SequenceHandler}. The
431    * method may return <code>null</code> if there isn't any handler installed.
432    *<br>
433    * A <code>SequenceHandler</code> deals with line and block comments, strings
434    * and special sequences.
435    *
436    * @return the currently active {@link de.susebox.jtopas.spi.SequenceHandler}
437    * or <code>null</code>, if no handler is installed
438    * @see #setSequenceHandler
439    */

440   public de.susebox.jtopas.spi.SequenceHandler getSequenceHandler();
441   
442   
443   /**
444    * Sets a new {@link de.susebox.jtopas.spi.PatternHandler} or removes any
445    * previously installed one. If <code>null</code> is passed, patterns are not
446    * supported by the tokenizer (any longer).
447    *<br>
448    * Usually, the {@link TokenizerProperties} used by a <code>Tokenizer</code>
449    * implement the {@link de.susebox.jtopas.spi.PatternHandler} interface. If
450    * so, the <code>Tokenizer</code> object sets the <code>TokenizerProperties</code>
451    * instance as its <code>PatternHandler</code>. A different handler or a
452    * handler specific to a certain <code>Tokenizer</code> instance can be set
453    * using this method.
454    *
455    * @param handler the (new) {@link de.susebox.jtopas.spi.PatternHandler} to
456    * use or <code>null</code> to remove it
457    * @see #getPatternHandler
458    * @see TokenizerProperties#addPattern
459    */

460   public void setPatternHandler(de.susebox.jtopas.spi.PatternHandler handler);
461   
462   /**
463    * Retrieves the current {@link de.susebox.jtopas.spi.PatternHandler}. The method
464    * may return <code>null</code> if there isn't any handler installed.
465    *
466    * @return the currently active {@link de.susebox.jtopas.spi.PatternHandler}
467    * or <code>null</code>, if patterns are not recognized by the tokenizer
468    * @see #setPatternHandler
469    */

470   public de.susebox.jtopas.spi.PatternHandler getPatternHandler();
471   
472
473   //---------------------------------------------------------------------------
474
// tokenizer operations
475
//
476

477   /**
478    * Checks if there are more tokens available. This method will return
479    * <code>true</code> until an end-of-file condition is encountered during a
480    * call to {@link #nextToken} or {@link #nextImage}.
481    *<br>
482    * That means that the EOF is returned one time, afterwards <code>hasMoreToken</code>
483    * will return <code>false</code>. Furthermore, that implies that the method
484    * will return <code>true</code> at least once, even if the input data stream
485    * is empty.
486    *<br>
487    * The method can be conveniently used in a while loop.
488    *
489    * @return <code>true</code> if a call to {@link #nextToken} or {@link #nextImage}
490    * will succeed, <code>false</code> otherwise
491    */

492   public boolean hasMoreToken();
493   
494   /**
495    * Retrieves the next {@link Token}. The method works in this order:
496    *<ol><li>
497    * Check for an end-of-file condition. If there is such a condition then
498    * return it.
499    *</li><li>
500    * Try to collect a sequence of whitespaces. If such a sequence can be found
501    * return it if the flag <code>F_RETURN_WHITESPACES</code> is set, or skip these
502    * whitespaces.
503    *</li><li>
504    * Check the next characters against all known patterns. A pattern is usually
505    * a regular expression that is used by {@link java.util.regex.Pattern}. But
506    * implementations of {@link de.susebox.jtopas.spi.PatternHandler} may use
507    * other pattern syntaxes. Note that patterns are not recognized within
508    * "normal" text (see below for a more precise description).
509    *</li><li>
510    * Check the next characters against all known line and block comments. If
511    * a line or block comment starting sequence matches, return it if the flag
512    * <code>F_RETURN_WHITESPACES</code> is set, or skip the comment.
513    * If comments are returned they include their starting and ending sequences
514    * (newline in case of a line comment).
515    *</li><li>
516    * Check the next characters against all known string starting sequences. If
517    * a string begin could be identified return the string until and including
518    * the closing sequence.
519    *</li><li>
520    * Check the next characters against all known special sequences. Especially,
521    * find the longest possible match. If a special sequence could be identified
522    * then return it.
523    *</li><li>
524    * Check for ordinary separators. If one could be found return it.
525    *</li><li>
526    * Check the next characters against all known keywords. If a keyword could
527    * be identified then return it.
528    *</li><li>
529    * Return the text portion until the next whitespace, comment, special
530    * sequence or separator. Note that patterns are not recognized within "normal"
531    * text. A pattern match has therefore always a whitespace, comment, special
532    * sequence, separator or another pattern match in front of it or starts at
533    * position 0 of the data.
534    *</li></ol>
535    * The method will return the EOF token as long as {@link #hasMoreToken} returns
536    * <code>false</code>. It will not return <code>null</code> in such conditions.
537    *
538    * @return found {@link Token} including the EOF token
539    * @throws TokenizerException generic exception (list) for all problems that may occur while parsing
540    * (IOExceptions for instance)
541    * @see #nextImage
542    */

543   public Token nextToken() throws TokenizerException;
544  
545   /**
546    * This method is a convenience method. It returns only the next token image
547    * without any informations about its type or associated information. This is
548    * an especially usefull method, if the parse flags for this <code>Tokenizer</code>
549    * have the flag {@link TokenizerProperties#F_TOKEN_POS_ONLY} set, since this
550    * method returns a valid string even in that case.
551    *
552    * @return the token image of the next token
553    * @throws TokenizerException generic exception (list) for all problems that may occur while parsing
554    * (IOExceptions for instance)
555    * @see #nextToken
556    * @see #currentImage
557    */

558   public String JavaDoc nextImage() throws TokenizerException;
559  
560   /**
561    * Retrieves the {@link Token} that was found by the last call to {@link #nextToken}
562    * or {@link #nextImage}.
563    *<br>
564    * Since version 0.6.1 of JTopas, this method throws a {@link TokenizerException}
565    * rather than returning <code>null</code> if neither {@link #nextToken} nor
566    * {@link #nextImage} have been called before or {@link #setReadPositionRelative}
567    * or {@link #setReadPositionAbsolute} have been called after the last call to
568    * <code>nextToken</code> or <code>nextImage</code>.
569    *
570    * @return the {@link Token} retrieved by the last call to {@link #nextToken}
571    * @throws TokenizerException if the tokenizer has no current token
572    * @see #nextToken
573    * @see #currentImage
574    */

575   public Token currentToken() throws TokenizerException;
576  
577   /**
578    * Convenience method to retrieve only the token image of the {@link Token} that
579    * would be returned by {@link #currentToken}. This is an especially usefull
580    * method, if the parse flags for this <code>Tokenizer</code> have the
581    * flag {@link TokenizerProperties#F_TOKEN_POS_ONLY} set, since this method
582    * returns a valid string even in that case.
583    *<br>
584    * Since version 0.6.1 of JTopas, this method throws a {@link TokenizerException}
585    * rather than returning <code>null</code> if neither {@link #nextToken} nor
586    * {@link #nextImage} have been called before or {@link #setReadPositionRelative}
587    * or {@link #setReadPositionAbsolute} habe been called after the last call to
588    * <code>nextToken</code> or <code>nextImage</code>.
589    *
590    * @return the token image of the current token
591    * @throws TokenizerException if the tokenizer has no current token
592    * @see #currentToken
593    * @see #nextImage
594    */

595   public String JavaDoc currentImage() throws TokenizerException;
596
597   
598   //---------------------------------------------------------------------------
599
// line and column positions
600
//
601

602   /**
603    * If the flag {@link TokenizerProperties#F_COUNT_LINES} is set, this method
604    * will return the line number starting with 0 in the input stream. The
605    * implementation of the <code>Tokenizer</code> interface can decide which
606    * end-of-line sequences should be recognized. The most flexible approach is
607    * to process the following end-of-line sequences:
608    * <br><ul><li>
609    * Carriage Return (ASCII 13, '\r'). This EOL is used on Apple Macintosh.
610    * </li><li>
611    * Linefeed (ASCII 10, '\n'). This is the UNIX EOL character.
612    * </li><li>
613    * Carriage Return + Linefeed ("\r\n"). This is used on MS Windows systems.
614    * </li></ul>
615    * Another legitimate and in many cases satisfactory way is to use the system
616    * property "line.separator".
617    *<br>
618    * Displaying information about lines usually means adding 1 to the zero-based
619    * line number.
620    *
621    * @return the current line number starting with 0 or -1 if no line numbers
622    * are supplied ({@link TokenizerProperties#F_COUNT_LINES} is not set).
623    * @see #getColumnNumber
624    */

625   public int getLineNumber();
626   
627   /**
628    * If the flag {@link TokenizerProperties#F_COUNT_LINES} is set, this method
629    * will return the current column position starting with 0 in the input stream.
630    * Displaying information about columns usually means adding 1 to the zero-based
631    * column number.
632    *
633    * @return the current column position starting with 0 or -1 if no column
634    * numbers are supplied ({@link TokenizerProperties#F_COUNT_LINES} is not
635    * set).
636    * @see #getLineNumber
637    */

638   public int getColumnNumber();
639   
640   
641   //---------------------------------------------------------------------------
642
// text range operations
643
//
644

645   /**
646    * This method returns the absolute offset in characters to the start of the
647    * parsed stream. Together with {@link #currentlyAvailable} it describes the
648    * currently available text "window".
649    *<br>
650    * The positions returned by this method and by {@link #getReadPosition}
651    * are absolute rather than relative to a text buffer, to give the tokenizer
652    * full control of how and when to refill its text buffer.
653    *
654    * @return the absolute offset of the current text window in characters from
655    * the start of the data source of the Tokenizer
656    */

657   public int getRangeStart();
658   
659   /**
660    * Gets the current read offset. This is the absolute position where the
661    * next call to <code>nextToken</code> or <code>next</code> will start. It is
662    * therefore <b>not</b> the same as the position returned by
663    * {@link Token#getStartPosition} of the current token ({@link #currentToken}).
664    *<br>
665    * It is the starting position of the token returned by the next call to
666    * {@link #nextToken}, if that token is not whitespace or if whitespaces are
667    * returned ({@link TokenizerProperties#F_RETURN_WHITESPACES}).
668    *<br>
669    * The positions returned by this method and by {@link #getRangeStart}
670    * are absolute rather than relative to a text buffer, to give the tokenizer
671    * full control of how and when to refill its text buffer.
672    *
673    * @return the absolute offset in characters from the start of the data source
674    * of the Tokenizer where reading will be continued
675    */

676   public int getReadPosition();
677   
678   /**
679    * Retrieves the number of currently available characters. This includes
680    * both characters already parsed by the <code>Tokenizer</code> and characters
681    * still to be analyzed.
682    *
683    * @return number of currently available characters
684    */

685   public int currentlyAvailable();
686   
687   /**
688    * Retrieves text from the currently available range. The text described by
689    * <code>start</code> and <code>length</code> must lie between {@link #getRangeStart}
690    * and {@link #getRangeStart} + {@link #currentlyAvailable}.
691    *<br>
692    * Example:
693    *<blockquote><pre>
694    * int startPos = tokenizer.getReadPosition();
695    * String source;
696    *
697    * while (tokenizer.hasMoreToken()) {
698    *   Token token = tokenizer.nextToken();
699    *
700    *   switch (token.getType()) {
701    *   case Token.LINE_COMMENT:
702    *   case Token.BLOCK_COMMENT:
703    *     source = tokenizer.getText(startPos, token.getStartPosition() - startPos);
704    *     startPos = token.getStartPosition();
705    *   }
706    * }
707    *</pre></blockquote>
708    *
709    * @param start position where the text begins
710    * @param length length of the text
711    * @return the text beginning at the given position with the given length
712    * @throws IndexOutOfBoundsException if the starting position or the length is
713    * out of the current text window
714    */

715   public String JavaDoc getText(int start, int length) throws IndexOutOfBoundsException JavaDoc;
716   
717   /**
718    * Gets a single character from the current text range.
719    *
720    * @param pos position of the required character
721    * @return the character at the specified position
722    * @throws IndexOutOfBoundsException if the parameter <code>pos</code> is not
723    * in the available text range (text window)
724    */

725   public char getChar(int pos) throws IndexOutOfBoundsException JavaDoc;
726   
727   /**
728    * Tries to read more data into the text buffer of the tokenizer. This can be
729    * useful when a method needs to look ahead of the available data or a skip
730    * operation should be performed.
731    *<br>
732    * The method returns the same value as an immediately following call to
733    * {@link #currentlyAvailable} would return.
734    *
735    * @return the number of characters now available
736    * @throws TokenizerException generic exception (list) for all problems that
737    * may occur while reading (IOExceptions for instance)
738    */

739   public int readMore() throws TokenizerException;
740   
741   /**
742    * This method sets the tokenizer's current read position to the given absolute
743    * read position. It realizes one type of rewind / forward operation. The
744    * given position must be inside the interval from {@link #getRangeStart} to
745    * {@link #getRangeStart} + {@link #currentlyAvailable} - 1.
746    *<br>
747    * The current read position is the end position of the current token. That means
748    * that the following assertion can be made:
749    *<pre>
750    * Token token1 = tokenizer.nextToken();
751    * tokenizer.setReadPositionAbsolute(tokenizer.getReadPosition() - token1.getLength());
752    * Token token2 = tokenizer.nextToken();
753    * assert(token1.equals(token2));
754    *</pre>
755    *<br>
756    * Since JTopas version 0.6.1, the operation clears the current token. Therefore,
757    * {@link #currentImage} and {@link #currentToken} will throw a {@link TokenizerException}
758    * if called after a <code>setReadPositionAbsolute</code> without a subsequent
759    * call to {@link #nextToken} or {@link #nextImage}.
760    *
761    * @param position absolute position for the next parse operation
762    * @throws IndexOutOfBoundsException if the parameter <code>position</code> is
763    * not in the available text range (text window)
764    * @see #setReadPositionRelative
765    */

766   public void setReadPositionAbsolute(int position) throws IndexOutOfBoundsException JavaDoc;
767   
768   /**
769    * This method moves the tokenizer's read position the given number of characters
770    * forward (positive value) or backward (negative value) starting from the current
771    * read position. It realizes one type of rewind / forward operation. The given
772    * offset must be greater than or equal to {@link #getRangeStart} - {@link #getReadPosition}
773    * and lower than {@link #getRangeStart} + {@link #currentlyAvailable} - {@link #getReadPosition}.
774    *<br>
775    * Since JTopas version 0.6.1, the operation clears the current token. Therefore,
776    * {@link #currentImage} and {@link #currentToken} will throw a {@link TokenizerException}
777    * if called after a <code>setReadPositionRelative</code> without a subsequent
778    * call to {@link #nextToken} or {@link #nextImage}.
779    *
780    * @param offset number of characters to move forward (positive offset) or
781    * backward (negative offset)
782    * @throws IndexOutOfBoundsException if the parameter <code>offset</code> would
783    * move the read position out of the available text range (text window)
784    * @see #setReadPositionAbsolute
785    */

786   public void setReadPositionRelative(int offset) throws IndexOutOfBoundsException JavaDoc;
787
788
789   //---------------------------------------------------------------------------
790
// Cleanup
791
//
792

793   /**
794    * This method is necessary to release memory and remove object references if
795    * <code>Tokenizer</code> instances are frequently created for small tasks.
796    * Generally, the method shouldn't throw any exceptions. It is also ok to call
797    * it more than once.
798    *<br>
799    * It is an error to call any other method of the implementing class after
800    * <code>close</code> has been called.
801    */

802   public void close();
803 }
804
Popular Tags