StringTokenizer


1   /*
2    * A replacement for java.util.StringTokenizer
3    * Copyright (C) 2001 Stephen Ostermiller
4    * http://ostermiller.org/contact.pl?regarding=Java+Utilities
5    *
6    * This program is free software; you can redistribute it and/or modify
7    * it under the terms of the GNU General Public License as published by
8    * the Free Software Foundation; either version 2 of the License, or
9    * (at your option) any later version.
10   *
11   * This program is distributed in the hope that it will be useful,
12   * but WITHOUT ANY WARRANTY; without even the implied warranty of
13   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14   * GNU General Public License for more details.
15   *
16   * See COPYING.TXT for details.
17   */
18  
19   package com.Ostermiller.util;
20  
21  /**
22   * The string tokenizer class allows an application to break a string into
23   * tokens.
24   * More information about this class is available from <a target="_top" HREF=
25   * "http://ostermiller.org/utils/StringTokenizer.html">ostermiller.org</a>.
26   * <p>
27   * The tokenization method is much simpler than the one used by the
28   * <code>StreamTokenizer</code> class. The <code>StringTokenizer</code> methods
29   * do not distinguish among identifiers, numbers, and quoted strings, nor do
30   * they recognize and skip comments.
31   * <p>
32   * The set of delimiters (the characters that separate tokens) may be specified
33   * either at creation time or on a per-token basis.
34   * <p>
35   * There are two kinds of delimiters: token delimiters and nontoken delimiters.
36   * A token is either one token delimiter character, or a maximal sequence of
37   * consecutive characters that are not delimiters.
38   * <p>
39   * A <code>StringTokenizer</code> object internally maintains a current
40   * position within the string to be tokenized. Some operations advance this
41   * current position past the characters processed.
42   * <p>
43   * The implementation is not thread safe; if a <code>StringTokenizer</code>
44   * object is intended to be used in multiple threads, an appropriate wrapper
45   * must be provided.
46   * <p>
47   * The following is one example of the use of the tokenizer. It also
48   * demonstrates the usefulness of having both token and nontoken delimiters in
49   * one <code>StringTokenizer</code>.
50   * <p>
51   * The code:
52   * <blockquote><code>
53   * String s = " &nbsp;( &nbsp; aaa  \t &nbsp;* (b+c1 ))";<br>
54   * StringTokenizer st = new StringTokenizer(s, " \t\n\r\f", "()+*");<br>
55   * while (st.hasMoreTokens()) {<br>
56   * &nbsp;&nbsp;&nbsp;&nbsp;System.out.println(st.nextToken());<br>
57   * };
58   * </code></blockquote>
59   * <p>
60   * prints the following output:
61   * <blockquote>
62   * (<br>
63   * aaa<br>
64   * *<br>
65   * (<br>
66   * b<br>
67   * +<br>
68   * c1<br>
69   * )<br>
70   * )
71   * </blockquote>
72   * <p>
73   * <b>Compatibility with <code>java.util.StringTokenizer</code></b>
74   * <p>
75   * In the original version of <code>java.util.StringTokenizer</code>, the method
76   * <code>nextToken()</code> left the current position after the returned token,
77   * and the method <code>hasMoreTokens()</code> moved (as a side effect) the
78   * current position before the beginning of the next token. Thus, the code:
79   * <blockquote><code>
80   * String s = "x=a,b,c";<br>
81   * java.util.StringTokenizer st = new java.util.StringTokenizer(s,"=");<br>
82   * System.out.println(st.nextToken());<br>
83   * while (st.hasMoreTokens()) {<br>
84   * &nbsp;&nbsp;&nbsp;&nbsp;System.out.println(st.nextToken(","));<br>
85   * };
86   * </code></blockquote>
87   * <p>
88   * prints the following output:
89   * <blockquote>
90   * x<br>
91   * a<br>
92   * b<br>
93   * c
94   * </blockquote>
95   * <p>
96   * The Java SDK 1.3 implementation removed the undesired side effect of
97   * <code>hasMoreTokens</code> method: now, it does not advance current position.
98   * However, after these changes the output of the above code was:
99   * <blockquote>
100  * x<br>
101  * =a<br>
102  * b<br>
103  * c
104  * </blockquote>
105  * <p>
106  * and there was no good way to produce a second token without "=".
107  * <p>
108  * To solve the problem, this implementation introduces a new method
109  * <code>skipDelimiters()</code>. To produce the original output, the above code
110  * should be modified as:
111  * <blockquote><code>
112  * String s = "x=a,b,c";<br>
113  * StringTokenizer st = new StringTokenizer(s,"=");<br>
114  * System.out.println(st.nextToken());<br>
115  * st.skipDelimiters();<br>
116  * while (st.hasMoreTokens()) {<br>
117  * &nbsp;&nbsp;&nbsp;&nbsp;System.out.println(st.nextToken(","));<br>
118  * };
119  * </code></blockquote>
120  *
121  * @author Stephen Ostermiller http://ostermiller.org/contact.pl?regarding=Java+Utilities
122  * @since ostermillerutils 1.00.00
123  */
124 public class StringTokenizer implements java.util.Enumeration  , java.util.Iterator   {
125 
126     /**
127      * The string to be tokenized.
128      * The code relies on this to never be null.
129      *
130      * @since ostermillerutils 1.00.00
131      */
132     protected String   text;
133 
134     /**
135      * The length of the text.
136      * Cached for performance.  This should be set whenever the
137      * string we are working with is changed.
138      *
139      * @since ostermillerutils 1.00.00
140      */
141     protected int strLength;
142 
143     /**
144      * The set of nontoken delimiters.
145      *
146      * @since ostermillerutils 1.00.00
147      */
148     protected String   nontokenDelims;
149 
150     /**
151      * The set of token delimiters.
152      *
153      * @since ostermillerutils 1.00.00
154      */
155     protected String   tokenDelims;
156 
157     /**
158      * One of two variables used to maintain state through
159      * the tokenizing process.
160      * <P>
161      * Represents the position at which we should start looking for
162      * the next token (the position of the character immediately
163      * following the end of the last token, or 0 to start), or
164      * -1 if the entire string has been examined.
165      *
166      * @since ostermillerutils 1.00.00
167      */
168     protected int position;
169 
170     /**
171      * One of two variables used to maintain state through
172      * the tokenizing process.
173      * <p>
174      * true if and only if it is found that an empty token should
175      * be returned or if an empty token was the last thing returned.
176      * <p>
177      * If returnEmptyTokens is false, then this variable will
178      * always be false.
179      *
180      * @since ostermillerutils 1.00.00
181      */
182     protected boolean emptyReturned;
183 
184     /**
185      * Stores the value of the delimiter character with the
186      * highest value. It is used to optimize the detection of delimiter
187      * characters.  The common case will be that the int values of delimiters
188      * will be less than that of most characters in the string (comma or space
189      * is less than any letter, for example).  Given this, we can easily check
190      * to see if a character is not a delimiter by comparing it to the max
191      * delimiter.  If it is greater than the max delimiter, then it is not
192      * a delimiter; otherwise we have to do some more in-depth analysis (i.e.
193      * search the delimiter string).  This will reduce the running time of
194      * the algorithm not to depend on the length of the delimiter string
195      * for the common case.
196      *
197      * @since ostermillerutils 1.00.00
198      */
199     protected char maxDelimChar;
200 
201     /**
202      * Whether empty tokens should be returned.
203      * ie if "" should be returned when text starts with
204      * a delim, has two delims next to each other, or
205      * ends with a delim.
206      *
207      * @since ostermillerutils 1.00.00
208      */
209     protected boolean returnEmptyTokens;
210 
211     /**
212      * Indicates at which position the delimiters last changed.  This
213      * will affect how null tokens are returned.  Any
214      * time that delimiters are changed, the string will be treated as if
215      * it is being parsed from position zero, ie, null strings are possible
216      * at the very beginning.
217      *
218      * @since ostermillerutils 1.00.00
219      */
220     protected int delimsChangedPosition;
221 
222     /**
223      * A cache of the token count.  This variable should be -1 if the tokens
224      * have not yet been counted. It should be greater than or equal to zero
225      * if the tokens have been counted.
226      *
227      * @since ostermillerutils 1.00.00
228      */
229     protected int tokenCount;
230 
231     /**
232      * Constructs a string tokenizer for the specified string. Both token and
233      * nontoken delimiters are specified.
234      * <p>
235      * The current position is set at the beginning of the string.
236      *
237      * @param text a string to be parsed.
238      * @param nontokenDelims the nontoken delimiters, i.e. the delimiters that only separate
239      *     tokens and are not returned as separate tokens.
240      * @param tokenDelims the token delimiters, i.e. delimiters that both separate tokens,
241      *     and are themselves returned as tokens.
242      * @throws NullPointerException if text is null.
243      *
244      * @since ostermillerutils 1.00.00
245      */
246     public StringTokenizer(String   text, String   nontokenDelims, String   tokenDelims){
247         this(text, nontokenDelims, tokenDelims, false); // delegate to the primary constructor with returnEmptyTokens = false
248     }
249 
250     /**
251      * Constructs a string tokenizer for the specified string. Both token and
252      * nontoken delimiters are specified and whether or not empty tokens are returned
253      * is specified.
254      * <p>
255      * Empty tokens are tokens that are between consecutive delimiters.
256      * <p>
257      * It is a primary constructor (i.e. all other constructors are defined in terms
258      * of it.)
259      * <p>
260      * The current position is set at the beginning of the string.
261      *
262      * @param text a string to be parsed.
263      * @param nontokenDelims the nontoken delimiters, i.e. the delimiters that only separate
264      *     tokens and are not returned as separate tokens.
265      * @param tokenDelims the token delimiters, i.e. delimiters that both separate tokens,
266      *     and are themselves returned as tokens.
267      * @param returnEmptyTokens true if empty tokens may be returned; false otherwise.
268      * @throws NullPointerException if text is null.
269      *
270      * @since ostermillerutils 1.00.00
271      */
272     public StringTokenizer(String   text, String   nontokenDelims, String   tokenDelims, boolean returnEmptyTokens){
273         setDelims(nontokenDelims, tokenDelims); // runs before setText, so it reads position and strLength while they still hold their initial value 0
274         setText(text); // rejects a null text and resets position, delimsChangedPosition and the cached token count
275         setReturnEmptyTokens(returnEmptyTokens);
276     }
277 
278     /**
279      * Constructs a string tokenizer for the specified string. Either token or
280      * nontoken delimiters are specified.
281      * <p>
282      * Is equivalent to:
283      * <ul>
284      * <li> If the third parameter is <code>false</code> --
285      *      <code>StringTokenizer(text,delims, null)</code>
286      * <li> If the third parameter is <code>true</code> --
287      *      <code>StringTokenizer(text, null ,delims)</code>
288      * </ul>
289      *
290      * @param text a string to be parsed.
291      * @param delims the delimiters.
292      * @param delimsAreTokens
293      *     flag indicating whether the second parameter specifies token or
294      *     nontoken delimiters: <code>false</code> -- the second parameter
295      *     specifies nontoken delimiters, the set of token delimiters is
296      *     empty; <code>true</code> -- the second parameter specifies token
297      *     delimiters, the set of nontoken delimiters is empty.
298      * @throws NullPointerException if text is null.
299      *
300      * @since ostermillerutils 1.00.00
301      */
302     public StringTokenizer(String   text, String   delims, boolean delimsAreTokens){
303         this(text, (delimsAreTokens ? null : delims), (delimsAreTokens ? delims : null)); // delims goes to exactly one of the two delimiter sets; the other one is passed as null
304     }
305 
306     /**
307      * Constructs a string tokenizer for the specified string. The characters in the
308      * <code>nontokenDelims</code> argument are the delimiters for separating
309      * tokens. Delimiter characters themselves will not be treated as tokens.
310      * <p>
311      * Is equivalent to <code>StringTokenizer(text,nontokenDelims, null)</code>.
312      *
313      * @param text a string to be parsed.
314      * @param nontokenDelims the nontoken delimiters.
315      * @throws NullPointerException if text is null.
316      *
317      * @since ostermillerutils 1.00.00
318      */
319     public StringTokenizer(String   text, String   nontokenDelims){
320         this(text, nontokenDelims, null); // null: no token delimiters
321     }
322 
323     /**
324      * Constructs a string tokenizer for the specified string. The tokenizer uses
325      * " \t\n\r\f" as a delimiter set of nontoken delimiters, and an empty token
326      * delimiter set.
327      * <p>
328      * Is equivalent to <code>StringTokenizer(text, " \t\n\r\f", null)</code>.
329      *
330      * @param text a string to be parsed.
331      * @throws NullPointerException if text is null.
332      *
333      * @since ostermillerutils 1.00.00
334      */
335     public StringTokenizer(String   text){
336         this(text, " \t\n\r\f", null); // space, tab, newline, carriage return and form feed as nontoken delimiters; no token delimiters
337     }
338 
339     /**
340      * Set the text to be tokenized in this StringTokenizer.
341      * <p>
342      * This is useful for StringTokenizer re-use so that new string tokenizers do not
343      * have to be created for each string you want to tokenize.
344      * <p>
345      * The string will be tokenized from the beginning of the string.
346      *
347      * @param text a string to be parsed.
348      * @throws NullPointerException if text is null.
349      *
350      * @since ostermillerutils 1.00.00
351      */
352     public void setText(String   text){
353         if (text == null){
354             throw new NullPointerException  ();
355         }
356         this.text = text;
357         strLength = text.length();
358         emptyReturned = false;
359         // set the position to start evaluation to zero
360         // unless the string has no length, in which case
361         // the entire string has already been examined.
362         position = (strLength > 0 ? 0: -1);
363         // because the text was changed since the last time the delimiters
364         // were changed we need to set the delimiter changed position
365         delimsChangedPosition = 0;
366         // The token count changes when the text changes
367         tokenCount = -1;
368     }
369 
370     /**
371      * Set the delimiters for this StringTokenizer.
372      * The position must be initialized before this method is used.
373      * (setText does this and it is called from the constructor)
374      *
375      * @param nontokenDelims delimiters that should not be returned as tokens.
376      * @param tokenDelims delimiters that should be returned as tokens.
377      *
378      * @since ostermillerutils 1.00.00
379      */
380     private void setDelims(String   nontokenDelims, String   tokenDelims){
381         this.nontokenDelims = nontokenDelims;
382         this.tokenDelims = tokenDelims;
383         // If we change delimiters, we do not want to start fresh,
384         // without returning empty tokens.
385         // the delimiter changed position can never be less than
386         // zero, unlike position.
387         delimsChangedPosition = (position != -1 ? position : strLength);
388         // set the max delimiter
389         maxDelimChar = 0;
390         for (int i=0; nontokenDelims != null && i < nontokenDelims.length(); i++){
391             if (maxDelimChar < nontokenDelims.charAt(i)){
392                 maxDelimChar = nontokenDelims.charAt(i);
393             }
394         }
395         for (int i=0; tokenDelims != null && i < tokenDelims.length(); i++){
396             if (maxDelimChar < tokenDelims.charAt(i)){
397                 maxDelimChar = tokenDelims.charAt(i);
398             }
399         }
400         // Changing the delimiters may change the number of tokens
401         tokenCount = -1;
402     }
403 
404 
405     /**
406      * Tests if there are more tokens available from this tokenizer's string.
407      * If this method returns <tt>true</tt>, then a subsequent call to
408      * <tt>nextToken</tt> with no argument will successfully return a token.
409      * <p>
410      * The current position is not changed.
411      *
412      * @return <code>true</code> if and only if there is at least one token in the
413      *          string after the current position; <code>false</code> otherwise.
414      *
415      * @since ostermillerutils 1.00.00
416      */
417     public boolean hasMoreTokens(){
418 
419         // handle the easy case in which the number
420         // of tokens has been counted.
421         if (tokenCount == 0){
422             return false;
423         } else if (tokenCount > 0){
424             return true;
425         }
426 
427         // copy over state variables from the class to local
428         // variables so that the state of this object can be
429         // restored to the state that it was in before this
430         // method was called.
431         int savedPosition = position;
432         boolean savedEmptyReturned = emptyReturned;
433 
434         int workingPosition = position;
435         boolean workingEmptyReturned = emptyReturned;
436         boolean onToken = advancePosition();
437         while(position != workingPosition ||
438             emptyReturned != workingEmptyReturned){
439             if (onToken){
440                 // restore object state
441                 position = savedPosition;
442                 emptyReturned = savedEmptyReturned;
443                 return true;
444             }
445             workingPosition = position;
446             workingEmptyReturned = emptyReturned;
447             onToken = advancePosition();
448         }
449 
450         // restore object state
451         position = savedPosition;
452         emptyReturned = savedEmptyReturned;
453         return false;
454     }
455 
456     /**
457      * Returns the next token from this string tokenizer.
458      * <p>
459      * The current position is set after the token returned.
460      *
461      * @return the next token from this string tokenizer.
462      * @throws NoSuchElementException if there are no more tokens in this tokenizer's string.
463      *
464      * @since ostermillerutils 1.00.00
465      */
466     public String   nextToken(){
467         int workingPosition = position;
468         boolean workingEmptyReturned = emptyReturned;
469         boolean onToken = advancePosition();
470         while(position != workingPosition ||
471             emptyReturned != workingEmptyReturned){
472             if (onToken){
473                 // returning a token decreases the token count
474                 tokenCount--;
475                 return (emptyReturned ? "" : text.substring(workingPosition, (position != -1) ? position : strLength));
476             }
477             workingPosition = position;
478             workingEmptyReturned = emptyReturned;
479             onToken = advancePosition();
480         }
481         throw new java.util.NoSuchElementException  ();
482     }
483 
484     /**
485      * Advances the current position so it is before the next token.
486      * <p>
487      * This method skips nontoken delimiters but does not skip
488      * token delimiters.
489      * <p>
490      * This method is useful when switching to the new delimiter sets (see the
491      * second example in the class comment.)
492      *
493      * @return <code>true</code> if there are more tokens, <code>false</code> otherwise.
494      *
495      * @since ostermillerutils 1.00.00
496      */
497     public boolean skipDelimiters(){
498         int workingPosition = position;
499         boolean workingEmptyReturned = emptyReturned;
500         boolean onToken = advancePosition();
501 
502         // skipping delimiters may cause the number of tokens to change
503         tokenCount = -1;
504 
505         while(position != workingPosition ||
506             emptyReturned != workingEmptyReturned){
507             if (onToken){
508                 // restore the state to just as it was before we found
509                 // this token and return
510                 position = workingPosition;
511                 emptyReturned = workingEmptyReturned;
512                 return true;
513             }
514             workingPosition = position;
515             workingEmptyReturned = emptyReturned;
516             onToken = advancePosition();
517         }
518 
519         // the end of the string was reached
520         // without finding any tokens
521         return false;
522     }
523 
524     /**
525      * Calculates the number of times that this tokenizer's <code>nextToken</code>
526      * method can be called before it generates an exception. The current position
527      * is not advanced.
528      *
529      * @return the number of tokens remaining in the string using the current
530      *    delimiter set.
531      *
532      * @see #nextToken()
533      * @since ostermillerutils 1.00.00
534      */
535     public int countTokens(){
536 
537         // return the cached token count if a cache
538         // is available.
539         if (this.tokenCount >=0){
540             return this.tokenCount;
541         }
542 
543         int tokenCount = 0;
544 
545         // copy over state variables from the class to local
546         // variables so that the state of this object can be
547         // restored to the state that it was in before this
548         // method was called.
549         int savedPosition = position;
550         boolean savedEmptyReturned = emptyReturned;
551 
552         int workingPosition = position;
553         boolean workingEmptyReturned = emptyReturned;
554         boolean onToken = advancePosition();
555         while(position != workingPosition ||
556             emptyReturned != workingEmptyReturned){
557             if (onToken){
558                 tokenCount++;
559             }
560             workingPosition = position;
561             workingEmptyReturned = emptyReturned;
562             onToken = advancePosition();
563         }
564 
565         // restore object state
566         position = savedPosition;
567         emptyReturned = savedEmptyReturned;
568 
569         // Save the token count in case this is called again
570         // so we wouldn't have to do so much work.
571         this.tokenCount = tokenCount;
572 
573         return tokenCount;
574     }
575 
576     /**
577      * Set the delimiters used to this set of (nontoken) delimiters.
578      *
579      * @param delims the new set of nontoken delimiters (the set of token delimiters will be empty).
580      *
581      * @since ostermillerutils 1.00.00
582      */
583     public void setDelimiters(String   delims){
584         setDelims(delims, null); // null: no token delimiters
585     }
586 
587     /**
588      * Set the delimiters used to this set of delimiters.
589      *
590      * @param delims the new set of delimiters.
591      * @param delimsAreTokens flag indicating whether the first parameter specifies
592      *    token or nontoken delimiters: false -- the first parameter specifies nontoken
593      *    delimiters, the set of token delimiters is empty; true -- the first parameter
594      *    specifies token delimiters, the set of nontoken delimiters is empty.
595      *
596      * @since ostermillerutils 1.00.00
597      */
598     public void setDelimiters(String   delims, boolean delimsAreTokens){
599         setDelims((delimsAreTokens ? null : delims), (delimsAreTokens ? delims : null));
600     }
601 
602     /**
603      * Set the delimiters used to this set of delimiters.
604      *
605      * @param nontokenDelims the new set of nontoken delimiters.
606      * @param tokenDelims the new set of token delimiters.
607      *
608      * @since ostermillerutils 1.00.00
609      */
610     public void setDelimiters(String   nontokenDelims, String   tokenDelims){
611         setDelims(nontokenDelims, tokenDelims); // public entry point for the private setDelims, which also invalidates the cached token count
612     }
613 
614     /**
615      * Set the delimiters used to this set of delimiters.
616      *
617      * @param nontokenDelims the new set of nontoken delimiters.
618      * @param tokenDelims the new set of token delimiters.
619      * @param returnEmptyTokens true if empty tokens may be returned; false otherwise.
620      *
621      * @since ostermillerutils 1.00.00
622      */
623     public void setDelimiters(String   nontokenDelims, String   tokenDelims, boolean returnEmptyTokens){
624         setDelims(nontokenDelims, tokenDelims); // replaces both delimiter sets and invalidates the cached token count
625         setReturnEmptyTokens(returnEmptyTokens);
626     }
627 
628     /**
629      * Calculates the number of times that this tokenizer's <code>nextToken</code>
630      * method can be called before it generates an exception using the given set of
631      * (nontoken) delimiters.  The delimiters given will be used for future calls to
632      * nextToken() unless new delimiters are given. The current position
633      * is not advanced.
634      *
635      * @param delims the new set of nontoken delimiters (the set of token delimiters will be empty).
636      * @return the number of tokens remaining in the string using the new
637      *    delimiter set.
638      *
639      * @see #countTokens()
640      * @since ostermillerutils 1.00.00
641      */
642     public int countTokens(String   delims){
643         setDelims(delims, null); // null: no token delimiters; the new delimiters stay in effect after this call
644         return countTokens();
645     }
646 
647     /**
648      * Calculates the number of times that this tokenizer's <code>nextToken</code>
649      * method can be called before it generates an exception using the given set of
650      * delimiters.  The delimiters given will be used for future calls to
651      * nextToken() unless new delimiters are given. The current position
652      * is not advanced.
653      *
654      * @param delims the new set of delimiters.
655      * @param delimsAreTokens flag indicating whether the first parameter specifies
656      *    token or nontoken delimiters: false -- the first parameter specifies nontoken
657      *    delimiters, the set of token delimiters is empty; true -- the first parameter
658      *    specifies token delimiters, the set of nontoken delimiters is empty.
659      * @return the number of tokens remaining in the string using the new
660      *    delimiter set.
661      *
662      * @see #countTokens()
663      * @since ostermillerutils 1.00.00
664      */
665     public int countTokens(String   delims, boolean delimsAreTokens){
666         setDelims((delimsAreTokens ? null : delims), (delimsAreTokens ? delims : null));
667         return countTokens();
668     }
669 
670     /**
671      * Calculates the number of times that this tokenizer's <code>nextToken</code>
672      * method can be called before it generates an exception using the given set of
673      * delimiters.  The delimiters given will be used for future calls to
674      * nextToken() unless new delimiters are given. The current position
675      * is not advanced.
676      *
677      * @param nontokenDelims the new set of nontoken delimiters.
678      * @param tokenDelims the new set of token delimiters.
679      * @return the number of tokens remaining in the string using the new
680      *    delimiter set.
681      *
682      * @see #countTokens()
683      * @since ostermillerutils 1.00.00
684      */
685     public int countTokens(String   nontokenDelims, String   tokenDelims){
686         setDelims(nontokenDelims, tokenDelims); // the new delimiters stay in effect after this call
687         return countTokens();
688     }
689 
690     /**
691      * Calculates the number of times that this tokenizer's <code>nextToken</code>
692      * method can be called before it generates an exception using the given set of
693      * delimiters.  The delimiters given will be used for future calls to
694      * nextToken() unless new delimiters are given. The current position
695      * is not advanced.
696      *
697      * @param nontokenDelims the new set of nontoken delimiters.
698      * @param tokenDelims the new set of token delimiters.
699      * @param returnEmptyTokens true if empty tokens may be returned; false otherwise.
700      * @return the number of tokens remaining in the string using the new
701      *    delimiter set.
702      *
703      * @see #countTokens()
704      * @since ostermillerutils 1.00.00
705      */
706     public int countTokens(String   nontokenDelims, String   tokenDelims, boolean returnEmptyTokens){
707         setDelims(nontokenDelims, tokenDelims); // the new delimiters stay in effect after this call
708         setReturnEmptyTokens(returnEmptyTokens);
709         return countTokens();
710     }
711 
712     /**
713      * Advances the state of the tokenizer to the next token or delimiter.  This method only
714      * modifies the class variables position, and emptyReturned.  The type of token that
715      * should be emitted can be deduced by examining the changes to these two variables.
716      * If there are no more tokens, the state of these variables does not change at all.
717      *
718      * @return true if we are at a juncture at which a token may be emitted, false otherwise.
719      *
720      * @since ostermillerutils 1.00.00
721      */
722     private boolean advancePosition(){
723         // if we are returning empty tokens, we are just starting to tokenizer
724         // and there is a delimiter at the beginning of the string or the string
725         // is empty we need to indicate that there is an empty token at the beginning.
726         // The beginning is defined as where the delimiters were last changed.
727         if (returnEmptyTokens && !emptyReturned &&
728             (delimsChangedPosition == position ||
729             (position == -1 && strLength == delimsChangedPosition))){
730             if (strLength == delimsChangedPosition){
731                 // Case in which the string (since delim change)
732                 // is empty, but because we are returning empty
733                 // tokens, a single empty token should be returned.
734                 emptyReturned = true;
735                 /*System.out.println("Empty token for empty string.");*/
736                 return true;
737             } else {
738                 char c = text.charAt(position);
739                 if (c <= maxDelimChar &&
740                     (nontokenDelims != null && nontokenDelims.indexOf(c) != -1) ||
741                     (tokenDelims != null && tokenDelims.indexOf(c) != -1)){
742                     // There is delimiter at the very start of the string
743                     // so we must return an empty token at the beginning.
744                     emptyReturned = true;
745                     /*System.out.println("Empty token at beginning.");*/
746                     return true;
747                 }
748             }
749         }
750         // The main loop
751         // Do this as long as parts of the string have yet to be examined
752         while (position != -1){
753             char c = text.charAt(position);
754             if (returnEmptyTokens && !emptyReturned && position > delimsChangedPosition){
755                 char c1 = text.charAt(position - 1);
756                 // Examine the current character and the one before it.
757                 // If both of them are delimiters, then we need to return
758                 // an empty delimiter.  Note that characters that were examined
759                 // before the delimiters changed should not be reexamined.
760                 if (c <= maxDelimChar && c1 <= maxDelimChar &&
761                     ((nontokenDelims != null && nontokenDelims.indexOf(c) != -1) ||
762                     (tokenDelims != null && tokenDelims.indexOf(c) != -1)) &&
763                     ((nontokenDelims != null && nontokenDelims.indexOf(c1) != -1) ||
764                     (tokenDelims != null && tokenDelims.indexOf(c1) != -1))){
765                     emptyReturned = true;
766                     /*System.out.println("Empty token.");*/
767                     return true;
768                 }
769             }
770 
771             int nextDelimiter = (position < strLength - 1 ? indexOfNextDelimiter(position + 1) : -1);
772             if (c > maxDelimChar ||
773                 ((nontokenDelims == null || nontokenDelims.indexOf(c) == -1) &&
774                 (tokenDelims == null || tokenDelims.indexOf(c) == -1))){
775                 // token found
776                 /*System.out.println("Token: '" +
777                     text.substring(position, (nextDelimiter == -1 ? strLength : nextDelimiter)) +
778                     "' at " + position + ".");*/
779                 position = nextDelimiter;
780                 emptyReturned = false;
781                 return true;
782             } else if (tokenDelims != null && tokenDelims.indexOf(c) != -1) {
783                 // delimiter that can be returned as a token found
784                 emptyReturned = false;
785                 /*System.out.println("Delimiter: '" + c + "' at " + position + ".");*/
786                 position = (position < strLength -1 ? position +1 : -1);
787                 return true;
788             } else {
789                 // delimiter that is not a token found.
790                 emptyReturned = false;
791                 position = (position < strLength -1 ? position +1 : -1);
792                 return false;
793             }
794         }
795         // handle the case that a token is at the end of the string and we should
796         // return empty tokens.
797         if (returnEmptyTokens && !emptyReturned && strLength > 0){
798             char c = text.charAt(strLength - 1);
799             if (c <= maxDelimChar &&
800                 (nontokenDelims != null && nontokenDelims.indexOf(c) != -1) ||
801                 (tokenDelims != null && tokenDelims.indexOf(c) != -1)){
802                 // empty token at the end of the string found.
803                 emptyReturned = true;
804                 /*System.out.println("Empty token at end.");*/
805                 return true;
806             }
807         }
808         return false;
809     }
810 
811     /**
812      * Returns the next token in this string tokenizer's string.
813      * <p>
814      * First, the sets of token and nontoken delimiters are changed to be the
815      * <code>tokenDelims</code> and <code>nontokenDelims</code>, respectively.
816      * Then the next token (with respect to new delimiters) in the string after the
817      * current position is returned.
818      * <p>
819      * The current position is set after the token returned.
820      * <p>
     * The new delimiter sets remain in use after this call.
822      *
823      * @param nontokenDelims the new set of nontoken delimiters.
824      * @param tokenDelims the new set of token delimiters.
825      * @return the next token, after switching to the new delimiter set.
826      * @throws NoSuchElementException if there are no more tokens in this tokenizer's string.
827      * @see #nextToken()
828      *
829      * @since ostermillerutils 1.00.00
830      */
831     public String   nextToken(String   nontokenDelims, String   tokenDelims){
832         setDelims(nontokenDelims, tokenDelims);
833         return nextToken();
834     }
835 
836     /**
837      * Returns the next token in this string tokenizer's string.
838      * <p>
839      * First, the sets of token and nontoken delimiters are changed to be the
840      * <code>tokenDelims</code> and <code>nontokenDelims</code>, respectively;
841      * and whether or not to return empty tokens is set.
842      * Then the next token (with respect to new delimiters) in the string after the
843      * current position is returned.
844      * <p>
845      * The current position is set after the token returned.
846      * <p>
     * The new delimiter sets remain in use after this call, and empty tokens
     * continue to be returned (or not) as they are in this call.
849      *
850      * @param nontokenDelims the new set of nontoken delimiters.
851      * @param tokenDelims the new set of token delimiters.
852      * @param returnEmptyTokens true if empty tokens may be returned; false otherwise.
853      * @return the next token, after switching to the new delimiter set.
854      * @throws NoSuchElementException if there are no more tokens in this tokenizer's string.
855      * @see #nextToken()
856      *
857      * @since ostermillerutils 1.00.00
858      */
859     public String   nextToken(String   nontokenDelims, String   tokenDelims, boolean returnEmptyTokens){
860         setDelims(nontokenDelims, tokenDelims);
861         setReturnEmptyTokens(returnEmptyTokens);
862         return nextToken();
863     }
864 
865     /**
866      * Returns the next token in this string tokenizer's string.
867      * <p>
868      * Is equivalent to:
869      * <ul>
870      * <li> If the second parameter is <code>false</code> --
871      *      <code>nextToken(delims, null)</code>
872      * <li> If the second parameter is <code>true</code> --
873      *      <code>nextToken(null ,delims)</code>
874      * </ul>
875      * <p>
876      * @param delims the new set of token or nontoken delimiters.
877      * @param delimsAreTokens
878      *     flag indicating whether the first parameter specifies token or
879      *     nontoken delimiters: <code>false</code> -- the first parameter
880      *     specifies nontoken delimiters, the set of token delimiters is
881      *     empty; <code>true</code> -- the first parameter specifies token
882      *     delimiters, the set of nontoken delimiters is empty.
883      * @return the next token, after switching to the new delimiter set.
884      * @throws NoSuchElementException if there are no more tokens in this tokenizer's string.
885      *
886      * @see #nextToken(String,String)
887      * @since ostermillerutils 1.00.00
888      */
889     public String   nextToken(String   delims, boolean delimsAreTokens){
890         return (delimsAreTokens ? nextToken(null, delims) : nextToken(delims, null));
891     }
892 
893     /**
894      * Returns the next token in this string tokenizer's string.
895      * <p>
     * Is equivalent to <code>nextToken(nontokenDelims, null)</code>.
897      *
898      * @param nontokenDelims the new set of nontoken delimiters (the set of
899      *     token delimiters will be empty).
900      * @return the next token, after switching to the new delimiter set.
901      * @throws NoSuchElementException if there are no more tokens in this
902      *     tokenizer's string.
903      *
904      * @see #nextToken(String,String)
905      * @since ostermillerutils 1.00.00
906      */
907     public String   nextToken(String   nontokenDelims){
908         return nextToken(nontokenDelims, null);
909     }
910 
911     /**
     * Similar to String.indexOf(String, int) but will look for
     * any character from the delimiter sets rather than the entire string.
914      *
915      * @param start index in text at which to begin the search
916      * @return index of the first delimiter from the start index (inclusive), or -1
917      *     if there are no more delimiters in the string
918      *
919      * @since ostermillerutils 1.00.00
920      */
921     private int indexOfNextDelimiter(int start){
922         char c;
923         int next;
924         for (next = start; (c = text.charAt(next)) > maxDelimChar ||
925             ((nontokenDelims == null || nontokenDelims.indexOf(c) == -1) &&
926             (tokenDelims == null || tokenDelims.indexOf(c) == -1)); next++){
927             if (next == strLength - 1){
928                 // we have reached the end of the string without
929                 // finding a delimiter
930                 return (-1);
931             }
932         }
933         return next;
934     }
935 
936     /**
937      * Returns the same value as the <code>hasMoreTokens()</code> method. It exists
938      * so that this class can implement the <code>Enumeration</code> interface.
939      *
940      * @return <code>true</code> if there are more tokens;
941      *    <code>false</code> otherwise.
942      *
943      * @see java.util.Enumeration
944      * @see #hasMoreTokens()
945      * @since ostermillerutils 1.00.00
946      */
947     public boolean hasMoreElements(){
948         return hasMoreTokens();
949     }
950 
951     /**
952      * Returns the same value as the <code>nextToken()</code> method, except that
953      * its declared return value is <code>Object</code> rather than
954      * <code>String</code>. It exists so that this class can implement the
955      * <code>Enumeration</code> interface.
956      *
957      * @return the next token in the string.
958      * @throws NoSuchElementException if there are no more tokens in this tokenizer's string.
959      *
960      * @see java.util.Enumeration
961      * @see #nextToken()
962      * @since ostermillerutils 1.00.00
963      */
964     public Object   nextElement(){
965         return nextToken();
966     }
967 
968     /**
969      * Returns the same value as the <code>hasMoreTokens()</code> method. It exists
970      * so that this class can implement the <code>Iterator</code> interface.
971      *
972      * @return <code>true</code> if there are more tokens;
973      *     <code>false</code> otherwise.
974      *
975      * @see java.util.Iterator
976      * @see #hasMoreTokens()
977      * @since ostermillerutils 1.00.00
978      */
979     public boolean hasNext(){
980         return hasMoreTokens();
981     }
982 
983     /**
984      * Returns the same value as the <code>nextToken()</code> method, except that
985      * its declared return value is <code>Object</code> rather than
986      * <code>String</code>. It exists so that this class can implement the
987      * <code>Iterator</code> interface.
988      *
989      * @return the next token in the string.
990      * @throws NoSuchElementException if there are no more tokens in this tokenizer's string.
991      *
992      * @see java.util.Iterator
993      * @see #nextToken()
994      * @since ostermillerutils 1.00.00
995      */
996     public Object   next(){
997         return nextToken();
998     }
999 
1000    /**
1001     * This implementation always throws <code>UnsupportedOperationException</code>.
1002     * It exists so that this class can implement the <code>Iterator</code> interface.
1003     *
1004     * @throws UnsupportedOperationException always is thrown.
1005     *
1006     * @see java.util.Iterator
1007     * @since ostermillerutils 1.00.00
1008     */
1009    public void remove(){
1010        throw new UnsupportedOperationException  ();
1011    }
1012
1013    /**
     * Set whether empty tokens should be returned from this point
     * in the tokenizing process onward.
1016     * <P>
1017     * Empty tokens occur when two delimiters are next to each other
1018     * or a delimiter occurs at the beginning or end of a string. If
1019     * empty tokens are set to be returned, and a comma is the non token
1020     * delimiter, the following table shows how many tokens are in each
1021     * string.<br>
1022     * <table><tr><th>String<th><th>Number of tokens<th></tr>
1023     * <tr><td align=right>"one,two"<td><td>2 - normal case with no empty tokens.<td></tr>
1024     * <tr><td align=right>"one,,three"<td><td>3 including the empty token in the middle.<td></tr>
1025     * <tr><td align=right>"one,"<td><td>2 including the empty token at the end.<td></tr>
1026     * <tr><td align=right>",two"<td><td>2 including the empty token at the beginning.<td></tr>
     * <tr><td align=right>","<td><td>2 including the empty tokens at the beginning and the end.<td></tr>
1028     * <tr><td align=right>""<td><td>1 - all strings will have at least one token if empty tokens are returned.<td></tr></table>
1029     *
1030     * @param returnEmptyTokens true iff empty tokens should be returned.
1031     *
1032     * @since ostermillerutils 1.00.00
1033     */
1034    public void setReturnEmptyTokens(boolean returnEmptyTokens){
1035        // this could effect the number of tokens
1036        tokenCount = -1;
1037        this.returnEmptyTokens = returnEmptyTokens;
1038    }
1039
1040    /**
     * Get the index of the character immediately following the end of the
     * last token.  This is the position at which this tokenizer will begin
     * looking for the next token when a <code>nextToken()</code> method is invoked.
1044     *
1045     * @return the current position or -1 if the entire string has been tokenized.
1046     *
1047     * @since ostermillerutils 1.00.00
1048     */
1049    public int getCurrentPosition(){
1050        return this.position;
1051    }
1052
1053    /**
1054     * Retrieve all of the remaining tokens in a String array.
1055     * This method uses the options that are currently set for
1056     * the tokenizer and will advance the state of the tokenizer
1057     * such that <code>hasMoreTokens()</code> will return false.
1058     *
1059     * @return an array of tokens from this tokenizer.
1060     *
1061     * @since ostermillerutils 1.00.00
1062     */
1063    public String  [] toArray(){
1064        String  [] tokenArray = new String  [countTokens()];
1065        for(int i=0; hasMoreTokens(); i++) {
1066            tokenArray[i] = nextToken();
1067        }
1068        return tokenArray;
1069    }
1070
1071    /**
1072     * Retrieves the rest of the text as a single token.
1073     * After calling this method hasMoreTokens() will always return false.
1074     *
1075     * @return any part of the text that has not yet been tokenized.
1076     *
1077     * @since ostermillerutils 1.00.00
1078     */
1079    public String   restOfText(){
1080        return nextToken(null, null);
1081    }
1082
1083    /**
1084     * Returns the same value as nextToken() but does not alter
1085     * the internal state of the Tokenizer.  Subsequent calls
1086     * to peek() or a call to nextToken() will return the same
1087     * token again.
1088     *
1089     * @return the next token from this string tokenizer.
1090     * @throws NoSuchElementException if there are no more tokens in this tokenizer's string.
1091     *
1092     * @since ostermillerutils 1.00.00
1093     */
1094    public String   peek(){
1095        // copy over state variables from the class to local
1096        // variables so that the state of this object can be
1097        // restored to the state that it was in before this
1098        // method was called.
1099        int savedPosition = position;
1100        boolean savedEmptyReturned = emptyReturned;
1101        int savedtokenCount = tokenCount;
1102
1103        // get the next token
1104        String   retval = nextToken();
1105
1106        // restore the state
1107        position = savedPosition;
1108        emptyReturned = savedEmptyReturned;
1109        tokenCount = savedtokenCount;
1110
1111        // return the nextToken;
1112        return(retval);
1113    }
1114}
1115
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags