StringTokenizer


1   /**
2   *******************************************************************************
3   * Copyright (C) 1996-2006, International Business Machines Corporation and    *
4   * others. All Rights Reserved.                                                *
5   *******************************************************************************
6   */
7   
8   package com.ibm.icu.util;
9   
10  import java.util.Enumeration  ;
11  import java.util.NoSuchElementException  ;
12  import com.ibm.icu.text.UnicodeSet;
13  import com.ibm.icu.text.UTF16;
14  
15  /**
16   * <p>The string tokenizer class allows an application to break a string 
17   * into tokens by performing code point comparison. 
18   * The <code>StringTokenizer</code> methods do not distinguish 
19   * among identifiers, numbers, and quoted strings, nor do they recognize 
20   * and skip comments.</p>
21   * <p>
22   * The set of delimiters (the codepoints that separate tokens) may be 
23   * specified either at creation time or on a per-token basis. 
24   * </p>
25   * <p>
26   * An instance of <code>StringTokenizer</code> behaves in one of three ways, 
27   * depending on whether it was created with the <code>returnDelims</code> 
28   * and <code>coalesceDelims</code>
29   * flags having the value <code>true</code> or <code>false</code>: 
30   * <ul>
31   * <li>If returnDelims is <code>false</code>, delimiter code points serve to 
32   * separate tokens. A token is a maximal sequence of consecutive 
33   * code points that are not delimiters. 
34   * <li>If returnDelims is <code>true</code>, delimiter code points are 
35   * themselves considered to be tokens. In this case, if coalesceDelims is
36   * <code>true</code>, such tokens will be the maximal sequence of consecutive
37   * code points that <em>are</em> delimiters.  If coalesceDelims is false,
38   * a token will be received for each delimiter code point.
39   * </ul>
40   * <p>A token is thus either one 
41   * delimiter code point, a maximal sequence of consecutive code points that
42   * are delimiters, or a maximal sequence of consecutive code 
43   * points that are not delimiters.
44   * </p>
45   * <p>
46   * A <tt>StringTokenizer</tt> object internally maintains a current 
47   * position within the string to be tokenized. Some operations advance this 
48   * current position past the code point processed.
49   * </p>
50   * <p>
51   * A token is returned by taking a substring of the string that was used to 
52   * create the <tt>StringTokenizer</tt> object.
53   * </p>
54   * <p>
55   * Example of the use of the default delimiter tokenizer.
56   * <blockquote><pre>
57   * StringTokenizer st = new StringTokenizer("this is a test");
58   * while (st.hasMoreTokens()) {
59   *     println(st.nextToken());
60   *     }
61   * </pre></blockquote>
62   * </p>
63   * <p>
64   * prints the following output:
65   * <blockquote><pre>
66   *     this
67   *     is
68   *     a
69   *     test
70   * </pre></blockquote>
71   * </p>
72   * <p>
73   * Example of the use of the tokenizer with user specified delimiter.
74   * <blockquote><pre>
75   *     StringTokenizer st = new StringTokenizer(
76   *     "this is a test with supplementary characters &#92;ud800&#92;ud800&#92;udc00&#92;udc00",
77   *         " &#92;ud800&#92;udc00");
78   *     while (st.hasMoreTokens()) {
79   *         println(st.nextToken());
80   *     }
81   * </pre></blockquote>
82   * </p>
83   * <p>
84   * prints the following output:
85   * <blockquote><pre>
86   *     this
87   *     is
88   *     a
89   *     test
90   *     with
91   *     supplementary
92   *     characters
93   *     &#92;ud800
94   *     &#92;udc00
95   * </pre></blockquote>
96   * </p>
97   * @author syn wee
98   * @stable ICU 2.4
99   */
100 public final class StringTokenizer implements Enumeration   
101 {
102     // public constructors ---------------------------------------------
103      
104     /**
105      * <p>Constructs a string tokenizer for the specified string. All 
106      * characters in the delim argument are the delimiters for separating 
107      * tokens.</p> 
108      * <p>If the returnDelims flag is false, the delimiter characters are 
109      * skipped and only serve as separators between tokens.</p>
110      * <p>If the returnDelims flag is true, then the delimiter characters 
111      * are also returned as tokens, one per delimiter.
112      * @param str a string to be parsed.
113      * @param delim the delimiters.
114      * @param returndelims flag indicating whether to return the delimiters 
115      *        as tokens.
116      * @exception throws a NullPointerException if str is null
117      * @stable ICU 2.4
118      */
119     public StringTokenizer(String   str, UnicodeSet delim, boolean returndelims)
120     {
121         this(str, delim, returndelims, false);
122     }
123 
124     /**
125      * <p>Constructs a string tokenizer for the specified string. All 
126      * characters in the delim argument are the delimiters for separating 
127      * tokens.</p> 
128      * <p>If the returnDelims flag is false, the delimiter characters are 
129      * skipped and only serve as separators between tokens.</p>
130      * <p>If the returnDelims flag is true, then the delimiter characters 
131      * are also returned as tokens.  If coalescedelims is true, one token
132      * is returned for each run of delimiter characters, otherwise one
133      * token is returned per delimiter.  Since surrogate pairs can be
134      * delimiters, the returned token might be two chars in length.</p>
135      * @param str a string to be parsed.
136      * @param delim the delimiters.
137      * @param returndelims flag indicating whether to return the delimiters 
138      *        as tokens.
139      * @param coalescedelims flag indicating whether to return a run of 
140      *        delimiters as a single token or as one token per delimiter.  
141      *        This only takes effect if returndelims is true.
142      * @exception throws a NullPointerException if str is null
143      * @internal ICU 3.4.3
144      * @deprecated This API is ICU internal only.
145      */
146     public StringTokenizer(String   str, UnicodeSet delim, boolean returndelims, boolean coalescedelims)
147     {
148         m_source_ = str;
149         m_length_ = str.length();
150         if (delim == null) {
151             m_delimiters_ = EMPTY_DELIMITER_;
152         }
153         else {
154             m_delimiters_ = delim;   
155         }
156         m_returnDelimiters_ = returndelims;
157         m_coalesceDelimiters_ = coalescedelims;
158         m_tokenOffset_ = -1;
159         m_tokenSize_ = -1;
160         if (m_length_ == 0) {
161             // string length 0, no tokens
162             m_nextOffset_ = -1;
163         }
164         else {
165             m_nextOffset_ = 0;
166             if (!returndelims) {
167                 m_nextOffset_ = getNextNonDelimiter(0);
168             }
169         }
170     }
171     
172     /**
173      * <p>Constructs a string tokenizer for the specified string. The 
174      * characters in the delim argument are the delimiters for separating 
175      * tokens.</p> 
176      * <p>Delimiter characters themselves will not be treated as tokens.</p>
177      * @param str a string to be parsed.
178      * @param delim the delimiters.
179      * @exception throws a NullPointerException if str is null
180      * @stable ICU 2.4
181      */
182     public StringTokenizer(String   str, UnicodeSet delim)
183     {
184         this(str, delim, false, false);
185     }
186        
187     /**
188      * <p>Constructs a string tokenizer for the specified string. All 
189      * characters in the delim argument are the delimiters for separating 
190      * tokens.</p> 
191      * <p>If the returnDelims flag is false, the delimiter characters are 
192      * skipped and only serve as separators between tokens.</p>
193      * <p>If the returnDelims flag is true, then the delimiter characters 
194      * are also returned as tokens, one per delimiter.
195      * @param str a string to be parsed.
196      * @param delim the delimiters.
197      * @param returndelims flag indicating whether to return the delimiters 
198      *        as tokens.
199      * @exception throws a NullPointerException if str is null
200      * @stable ICU 2.4
201      */
202     public StringTokenizer(String   str, String   delim, boolean returndelims)
203     {
204         this(str, delim, returndelims, false); // java default behavior
205     }
206 
207     /**
208      * <p>Constructs a string tokenizer for the specified string. All 
209      * characters in the delim argument are the delimiters for separating 
210      * tokens.</p> 
211      * <p>If the returnDelims flag is false, the delimiter characters are 
212      * skipped and only serve as separators between tokens.</p>
213      * <p>If the returnDelims flag is true, then the delimiter characters 
214      * are also returned as tokens.  If coalescedelims is true, one token
215      * is returned for each run of delimiter characters, otherwise one
216      * token is returned per delimiter.  Since surrogate pairs can be
217      * delimiters, the returned token might be two chars in length.</p>
218      * @param str a string to be parsed.
219      * @param delim the delimiters.
220      * @param returndelims flag indicating whether to return the delimiters 
221      *        as tokens.
222      * @param coalescedelims flag indicating whether to return a run of 
223      *        delimiters as a single token or as one token per delimiter.  
224      *        This only takes effect if returndelims is true.
225      * @exception throws a NullPointerException if str is null
226      * @internal ICU 3.4.3
227      * @deprecated This API is ICU internal only.
228      */
229     public StringTokenizer(String   str, String   delim, boolean returndelims, boolean coalescedelims)
230     {
231         // don't ignore whitespace
232         m_delimiters_ = EMPTY_DELIMITER_;
233         if (delim != null && delim.length() > 0) {
234             m_delimiters_ = new UnicodeSet();
235             m_delimiters_.addAll(delim);
236             checkDelimiters();
237         }
238         m_coalesceDelimiters_ = coalescedelims;
239         m_source_ = str;
240         m_length_ = str.length();
241         m_returnDelimiters_ = returndelims;
242         m_tokenOffset_ = -1;
243         m_tokenSize_ = -1;
244         if (m_length_ == 0) {
245             // string length 0, no tokens
246             m_nextOffset_ = -1;
247         }
248         else {
249             m_nextOffset_ = 0;
250             if (!returndelims) {
251                 m_nextOffset_ = getNextNonDelimiter(0);
252             }
253         }
254     }
255     
256     /**
257      * <p>Constructs a string tokenizer for the specified string. The 
258      * characters in the delim argument are the delimiters for separating 
259      * tokens.</p> 
260      * <p>Delimiter characters themselves will not be treated as tokens.</p>
261      * @param str a string to be parsed.
262      * @param delim the delimiters.
263      * @exception throws a NullPointerException if str is null
264      * @stable ICU 2.4
265      */
266     public StringTokenizer(String   str, String   delim)
267     {
268         // don't ignore whitespace
269         this(str, delim, false, false);
270     }
271 
272     /**
273      * <p>Constructs a string tokenizer for the specified string. 
274      * The tokenizer uses the default delimiter set, which is 
275      * " &#92;t&#92;n&#92;r&#92;f": 
276      * the space character, the tab character, the newline character, the 
277      * carriage-return character, and the form-feed character.</p> 
278      * <p>Delimiter characters themselves will not be treated as tokens.</p>
279      * @param str a string to be parsed
280      * @exception throws a NullPointerException if str is null
281      * @stable ICU 2.4
282      */
283     public StringTokenizer(String   str) 
284     {
285         this(str, DEFAULT_DELIMITERS_, false, false);
286     }
287     
288     // public methods --------------------------------------------------
289     
290     /**
291      * Tests if there are more tokens available from this tokenizer's 
292      * string. 
293      * If this method returns <tt>true</tt>, then a subsequent call to 
294      * <tt>nextToken</tt> with no argument will successfully return a token.
295      * @return <code>true</code> if and only if there is at least one token 
296      *         in the string after the current position; <code>false</code> 
297      *         otherwise.
298      * @stable ICU 2.4
299      */
300     public boolean hasMoreTokens() 
301     {
302         return m_nextOffset_ >= 0;
303     }
304     
305     /**
306      * Returns the next token from this string tokenizer.
307      * @return the next token from this string tokenizer.
308      * @exception NoSuchElementException if there are no more tokens in 
309      *            this tokenizer's string.
310      * @stable ICU 2.4
311      */
312     public String   nextToken() 
313     {
314         if (m_tokenOffset_ < 0) {
315             if (m_nextOffset_ < 0) {
316                 throw new NoSuchElementException  ("No more tokens in String");   
317             }
318             // pre-calculations of tokens not done
319             if (m_returnDelimiters_) {
320                 int tokenlimit = 0;
321                 int c = UTF16.charAt(m_source_, m_nextOffset_);
322                 boolean contains = delims == null 
323                     ? m_delimiters_.contains(c) 
324                     : c < delims.length && delims[c];
325                 if (contains) {
326                      if (m_coalesceDelimiters_) {
327                         tokenlimit = getNextNonDelimiter(m_nextOffset_);
328                      } else {
329                         tokenlimit = m_nextOffset_ + UTF16.getCharCount(c);
330                         if (tokenlimit == m_length_) {
331                             tokenlimit = -1;
332                         }
333                      }
334                 }
335                 else {
336                     tokenlimit = getNextDelimiter(m_nextOffset_);
337                 }
338                 String   result;
339                 if (tokenlimit < 0) {
340                     result = m_source_.substring(m_nextOffset_);
341                 }
342                 else {
343                     result = m_source_.substring(m_nextOffset_, tokenlimit);
344                 }
345                 m_nextOffset_ = tokenlimit;
346                 return result;
347             }
348             else {
349                 int tokenlimit = getNextDelimiter(m_nextOffset_);
350                 String   result;
351                 if (tokenlimit < 0) {
352                     result = m_source_.substring(m_nextOffset_);
353                     m_nextOffset_ = tokenlimit;
354                 }
355                 else {
356                     result = m_source_.substring(m_nextOffset_, tokenlimit);
357                     m_nextOffset_ = getNextNonDelimiter(tokenlimit);
358                 }
359                 
360                 return result;
361             }
362         }
363         // count was called before and we have all the tokens
364         if (m_tokenOffset_ >= m_tokenSize_) {
365             throw new NoSuchElementException  ("No more tokens in String");
366         }
367         String   result;
368         if (m_tokenLimit_[m_tokenOffset_] >= 0) {
369             result = m_source_.substring(m_tokenStart_[m_tokenOffset_],
370                                          m_tokenLimit_[m_tokenOffset_]);
371         }
372         else {
373             result = m_source_.substring(m_tokenStart_[m_tokenOffset_]);
374         }
375         m_tokenOffset_ ++;
376         m_nextOffset_ = -1;
377         if (m_tokenOffset_ < m_tokenSize_) {
378             m_nextOffset_ = m_tokenStart_[m_tokenOffset_];
379         }
380         return result;
381     }
382     
383     /**
384      * Returns the next token in this string tokenizer's string. First, 
385      * the set of characters considered to be delimiters by this 
386      * <tt>StringTokenizer</tt> object is changed to be the characters in 
387      * the string <tt>delim</tt>. Then the next token in the string
388      * after the current position is returned. The current position is 
389      * advanced beyond the recognized token.  The new delimiter set 
390      * remains the default after this call. 
391      * @param delim the new delimiters.
392      * @return the next token, after switching to the new delimiter set.
393      * @exception NoSuchElementException if there are no more tokens in 
394      *            this tokenizer's string.
395      * @stable ICU 2.4
396      */
397     public String   nextToken(String   delim) 
398     {
399         m_delimiters_ = EMPTY_DELIMITER_;
400         if (delim != null && delim.length() > 0) {
401             m_delimiters_ = new UnicodeSet();
402             m_delimiters_.addAll(delim);
403         }
404         return nextToken(m_delimiters_);
405     }
406     
407     /**
408      * Returns the next token in this string tokenizer's string. First, 
409      * the set of characters considered to be delimiters by this 
410      * <tt>StringTokenizer</tt> object is changed to be the characters in 
411      * the string <tt>delim</tt>. Then the next token in the string
412      * after the current position is returned. The current position is 
413      * advanced beyond the recognized token.  The new delimiter set 
414      * remains the default after this call. 
415      * @param delim the new delimiters.
416      * @return the next token, after switching to the new delimiter set.
417      * @exception NoSuchElementException if there are no more tokens in 
418      *            this tokenizer's string.
419      * @stable ICU 2.4
420      */
421     public String   nextToken(UnicodeSet delim) 
422     {
423         m_delimiters_ = delim;
424         checkDelimiters();
425         m_tokenOffset_ = -1;
426         m_tokenSize_ = -1;
427         if (!m_returnDelimiters_) {
428             m_nextOffset_ = getNextNonDelimiter(m_nextOffset_);
429         }
430         return nextToken();
431     }
432     
433     /**
434      * Returns the same value as the <code>hasMoreTokens</code> method. 
435      * It exists so that this class can implement the 
436      * <code>Enumeration</code> interface. 
437      * @return <code>true</code> if there are more tokens;
438      *         <code>false</code> otherwise.
439      * @see #hasMoreTokens()
440      * @stable ICU 2.4
441      */
442     public boolean hasMoreElements() 
443     {
444         return hasMoreTokens();
445     }
446     
447     /**
448      * Returns the same value as the <code>nextToken</code> method, except 
449      * that its declared return value is <code>Object</code> rather than 
450      * <code>String</code>. It exists so that this class can implement the 
451      * <code>Enumeration</code> interface. 
452      * @return the next token in the string.
453      * @exception NoSuchElementException if there are no more tokens in 
454      *            this tokenizer's string.
455      * @see #nextToken()
456      * @stable ICU 2.4
457      */
458     public Object   nextElement() 
459     {
460         return nextToken();
461     }
462     
463     /**
464      * Calculates the number of times that this tokenizer's 
465      * <code>nextToken</code> method can be called before it generates an 
466      * exception. The current position is not advanced.
467      * @return the number of tokens remaining in the string using the 
468      *         current delimiter set.
469      * @see #nextToken()
470      * @stable ICU 2.4
471      */
472     public int countTokens() 
473     {
474         int result = 0;
475         if (hasMoreTokens()) {
476             if (m_tokenOffset_ >= 0) {
477                 return m_tokenSize_ - m_tokenOffset_;
478             }
479             if (m_tokenStart_ == null) {
480                 m_tokenStart_ = new int[TOKEN_SIZE_];
481                 m_tokenLimit_ = new int[TOKEN_SIZE_];
482             }
483             do {
484                 if (m_tokenStart_.length == result) {
485                     int temptokenindex[] = m_tokenStart_;
486                     int temptokensize[] = m_tokenLimit_;
487                     int originalsize = temptokenindex.length;
488                     int newsize = originalsize + TOKEN_SIZE_;
489                     m_tokenStart_ = new int[newsize];
490                     m_tokenLimit_ = new int[newsize];
491                     System.arraycopy(temptokenindex, 0, m_tokenStart_, 0, 
492                                      originalsize);
493                     System.arraycopy(temptokensize, 0, m_tokenLimit_, 0, 
494                                      originalsize);
495                 }
496                 m_tokenStart_[result] = m_nextOffset_;
497                 if (m_returnDelimiters_) {
498                     int c = UTF16.charAt(m_source_, m_nextOffset_);
499                     boolean contains = delims == null 
500                         ? m_delimiters_.contains(c) 
501                         : c < delims.length && delims[c];
502                     if (contains) {
503                         if (m_coalesceDelimiters_) {
504                             m_tokenLimit_[result] = getNextNonDelimiter(
505                                                                 m_nextOffset_);
506                         } else {
507                             int p = m_nextOffset_ + 1;
508                             if (p == m_length_) {
509                                 p = -1;
510                             }
511                             m_tokenLimit_[result] = p;
512 
513                         }
514                     }
515                     else {
516                         m_tokenLimit_[result] = getNextDelimiter(m_nextOffset_);
517                     }
518                     m_nextOffset_ = m_tokenLimit_[result];
519                 }
520                 else {
521                     m_tokenLimit_[result] = getNextDelimiter(m_nextOffset_);
522                     m_nextOffset_ = getNextNonDelimiter(m_tokenLimit_[result]);
523                 }
524                 result ++;
525             } while (m_nextOffset_ >= 0);
526             m_tokenOffset_ = 0;
527             m_tokenSize_ = result;
528             m_nextOffset_ = m_tokenStart_[0];
529         }
530         return result;
531     }
532     
533     // private data members -------------------------------------------------
534     
535     /**
536      * Current offset to the token array. If the array token is not set up yet,
537      * this value is a -1
538      */
539     private int m_tokenOffset_;
540     /**
541      * Size of the token array. If the array token is not set up yet,
542      * this value is a -1
543      */
544     private int m_tokenSize_;
545     /**
546      * Array of pre-calculated tokens start indexes in source string terminated 
547      * by -1.
548      * This is only set up during countTokens() and only stores the remaining
549      * tokens, not all tokens including parsed ones
550      */
551     private int m_tokenStart_[];
552     /**
553      * Array of pre-calculated tokens limit indexes in source string.
554      * This is only set up during countTokens() and only stores the remaining
555      * tokens, not all tokens including parsed ones
556      */
557     private int m_tokenLimit_[];
558     /**
559      * UnicodeSet containing delimiters
560      */
561     private UnicodeSet m_delimiters_;
562     /**
563      * String to parse for tokens
564      */
565     private String   m_source_;
566     /**
567      * Length of m_source_
568      */
569     private int m_length_;
570     /**
571      * Current position in string to parse for tokens
572      */
573     private int m_nextOffset_;
574     /**
575      * Flag indicator if delimiters are to be treated as tokens too
576      */
577     private boolean m_returnDelimiters_;
578 
579     /**
580      * Flag indicating whether to coalesce runs of delimiters into single tokens
581      */
582     private boolean m_coalesceDelimiters_;
583 
584     /**
585      * Default set of delimiters &#92;t&#92;n&#92;r&#92;f
586      */
587     private static final UnicodeSet DEFAULT_DELIMITERS_ 
588                                         = new UnicodeSet("[ \t\n\r\f]", false);
589     /**
590      * Array size increments
591      */
592     private static final int TOKEN_SIZE_ = 100;
593     /**
594      * A empty delimiter UnicodeSet, used when user specified null delimiters
595      */
596     private static final UnicodeSet EMPTY_DELIMITER_ = new UnicodeSet();
597     
598     // private methods ------------------------------------------------------
599     
600     /**
601      * Gets the index of the next delimiter after offset
602      * @param offset to the source string
603      * @return offset of the immediate next delimiter, otherwise 
604      *         (- source string length - 1) if there
605      *         are no more delimiters after m_nextOffset
606      */
607     private int getNextDelimiter(int offset)
608     {
609         if (offset >= 0) {
610             int result = offset; 
611             int c = 0;
612             if (delims == null) {
613                 do {
614                     c = UTF16.charAt(m_source_, result);
615                     if (m_delimiters_.contains(c)) {
616                         break;
617                     }
618                     result ++;
619                 } while (result < m_length_);
620             } else {
621                 do {
622                     c = UTF16.charAt(m_source_, result);
623                     if (c < delims.length && delims[c]) {
624                         break;
625                     }
626                     result ++;
627                 } while (result < m_length_);
628             }                
629             if (result < m_length_) {
630                 return result;
631             }
632         }
633         return -1 - m_length_;
634     }
635     
636     /**
637      * Gets the index of the next non-delimiter after m_nextOffset_
638      * @param offset to the source string
639      * @return offset of the immediate next non-delimiter, otherwise 
640      *         (- source string length - 1) if there
641      *         are no more delimiters after m_nextOffset
642      */
643     private int getNextNonDelimiter(int offset)
644     {
645         if (offset >= 0) {
646             int result = offset; 
647             int c = 0;
648             if (delims == null) {
649                 do {
650                     c = UTF16.charAt(m_source_, result);
651                     if (!m_delimiters_.contains(c)) {
652                         break;
653                     }
654                     result ++;
655                 } while (result < m_length_);
656             } else {
657                 do {
658                     c = UTF16.charAt(m_source_, result);
659                     if (!(c < delims.length && delims[c])) {
660                         break;
661                     }
662                     result ++;
663                 } while (result < m_length_);
664             }
665             if (result < m_length_) {
666                 return result;
667             }
668         }
669         return -1 - m_length_;
670     }
671 
672     void checkDelimiters() {
673         if (m_delimiters_ == null || m_delimiters_.size() == 0) {
674             delims = new boolean[0];
675         } else {
676             int maxChar = m_delimiters_.getRangeEnd(m_delimiters_.getRangeCount()-1);
677             if (maxChar < 0x7f) {
678                 delims = new boolean[maxChar+1];
679                 for (int i = 0, ch; -1 != (ch = m_delimiters_.charAt(i)); ++i) {
680                     delims[ch] = true;
681                 }
682             } else {
683                 delims = null;
684             }
685         }
686     }
687     private boolean[] delims;
688 }
689
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags