KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > com > ibm > icu > util > StringTokenizer


1 /**
2 *******************************************************************************
3 * Copyright (C) 1996-2006, International Business Machines Corporation and *
4 * others. All Rights Reserved. *
5 *******************************************************************************
6 */

7
8 package com.ibm.icu.util;
9
10 import java.util.Enumeration JavaDoc;
11 import java.util.NoSuchElementException JavaDoc;
12 import com.ibm.icu.text.UnicodeSet;
13 import com.ibm.icu.text.UTF16;
14
15 /**
16  * <p>The string tokenizer class allows an application to break a string
17  * into tokens by performing code point comparison.
18  * The <code>StringTokenizer</code> methods do not distinguish
19  * among identifiers, numbers, and quoted strings, nor do they recognize
20  * and skip comments.</p>
21  * <p>
22  * The set of delimiters (the codepoints that separate tokens) may be
23  * specified either at creation time or on a per-token basis.
24  * </p>
25  * <p>
26  * An instance of <code>StringTokenizer</code> behaves in one of three ways,
27  * depending on whether it was created with the <code>returnDelims</code>
28  * and <code>coalesceDelims</code>
29  * flags having the value <code>true</code> or <code>false</code>:
30  * <ul>
31  * <li>If returnDelims is <code>false</code>, delimiter code points serve to
32  * separate tokens. A token is a maximal sequence of consecutive
33  * code points that are not delimiters.
34  * <li>If returnDelims is <code>true</code>, delimiter code points are
35  * themselves considered to be tokens. In this case, if coalesceDelims is
36  * <code>true</code>, such tokens will be the maximal sequence of consecutive
37  * code points that <em>are</em> delimiters. If coalesceDelims is false,
38  * a token will be received for each delimiter code point.
39  * </ul>
40  * <p>A token is thus either one
41  * delimiter code point, a maximal sequence of consecutive code points that
42  * are delimiters, or a maximal sequence of consecutive code
43  * points that are not delimiters.
44  * </p>
45  * <p>
46  * A <tt>StringTokenizer</tt> object internally maintains a current
47  * position within the string to be tokenized. Some operations advance this
48  * current position past the code point processed.
49  * </p>
50  * <p>
51  * A token is returned by taking a substring of the string that was used to
52  * create the <tt>StringTokenizer</tt> object.
53  * </p>
54  * <p>
55  * Example of the use of the default delimiter tokenizer.
56  * <blockquote><pre>
57  * StringTokenizer st = new StringTokenizer("this is a test");
58  * while (st.hasMoreTokens()) {
59  * println(st.nextToken());
60  * }
61  * </pre></blockquote>
62  * </p>
63  * <p>
64  * prints the following output:
65  * <blockquote><pre>
66  * this
67  * is
68  * a
69  * test
70  * </pre></blockquote>
71  * </p>
72  * <p>
73  * Example of the use of the tokenizer with user specified delimiter.
74  * <blockquote><pre>
75  * StringTokenizer st = new StringTokenizer(
76  * "this is a test with supplementary characters &#92;ud800&#92;ud800&#92;udc00&#92;udc00",
77  * " &#92;ud800&#92;udc00");
78  * while (st.hasMoreTokens()) {
79  * println(st.nextToken());
80  * }
81  * </pre></blockquote>
82  * </p>
83  * <p>
84  * prints the following output:
85  * <blockquote><pre>
86  * this
87  * is
88  * a
89  * test
90  * with
91  * supplementary
92  * characters
93  * &#92;ud800
94  * &#92;udc00
95  * </pre></blockquote>
96  * </p>
97  * @author syn wee
98  * @stable ICU 2.4
99  */

100 public final class StringTokenizer implements Enumeration JavaDoc
101 {
102     // public constructors ---------------------------------------------
103

104     /**
105      * <p>Constructs a string tokenizer for the specified string. All
106      * characters in the delim argument are the delimiters for separating
107      * tokens.</p>
108      * <p>If the returnDelims flag is false, the delimiter characters are
109      * skipped and only serve as separators between tokens.</p>
110      * <p>If the returnDelims flag is true, then the delimiter characters
111      * are also returned as tokens, one per delimiter.
112      * @param str a string to be parsed.
113      * @param delim the delimiters.
114      * @param returndelims flag indicating whether to return the delimiters
115      * as tokens.
116      * @exception throws a NullPointerException if str is null
117      * @stable ICU 2.4
118      */

119     public StringTokenizer(String JavaDoc str, UnicodeSet delim, boolean returndelims)
120     {
121         this(str, delim, returndelims, false);
122     }
123
124     /**
125      * <p>Constructs a string tokenizer for the specified string. All
126      * characters in the delim argument are the delimiters for separating
127      * tokens.</p>
128      * <p>If the returnDelims flag is false, the delimiter characters are
129      * skipped and only serve as separators between tokens.</p>
130      * <p>If the returnDelims flag is true, then the delimiter characters
131      * are also returned as tokens. If coalescedelims is true, one token
132      * is returned for each run of delimiter characters, otherwise one
133      * token is returned per delimiter. Since surrogate pairs can be
134      * delimiters, the returned token might be two chars in length.</p>
135      * @param str a string to be parsed.
136      * @param delim the delimiters.
137      * @param returndelims flag indicating whether to return the delimiters
138      * as tokens.
139      * @param coalescedelims flag indicating whether to return a run of
140      * delimiters as a single token or as one token per delimiter.
141      * This only takes effect if returndelims is true.
142      * @exception throws a NullPointerException if str is null
143      * @internal ICU 3.4.3
144      * @deprecated This API is ICU internal only.
145      */

146     public StringTokenizer(String JavaDoc str, UnicodeSet delim, boolean returndelims, boolean coalescedelims)
147     {
148         m_source_ = str;
149         m_length_ = str.length();
150         if (delim == null) {
151             m_delimiters_ = EMPTY_DELIMITER_;
152         }
153         else {
154             m_delimiters_ = delim;
155         }
156         m_returnDelimiters_ = returndelims;
157         m_coalesceDelimiters_ = coalescedelims;
158         m_tokenOffset_ = -1;
159         m_tokenSize_ = -1;
160         if (m_length_ == 0) {
161             // string length 0, no tokens
162
m_nextOffset_ = -1;
163         }
164         else {
165             m_nextOffset_ = 0;
166             if (!returndelims) {
167                 m_nextOffset_ = getNextNonDelimiter(0);
168             }
169         }
170     }
171     
172     /**
173      * <p>Constructs a string tokenizer for the specified string. The
174      * characters in the delim argument are the delimiters for separating
175      * tokens.</p>
176      * <p>Delimiter characters themselves will not be treated as tokens.</p>
177      * @param str a string to be parsed.
178      * @param delim the delimiters.
179      * @exception throws a NullPointerException if str is null
180      * @stable ICU 2.4
181      */

182     public StringTokenizer(String JavaDoc str, UnicodeSet delim)
183     {
184         this(str, delim, false, false);
185     }
186        
187     /**
188      * <p>Constructs a string tokenizer for the specified string. All
189      * characters in the delim argument are the delimiters for separating
190      * tokens.</p>
191      * <p>If the returnDelims flag is false, the delimiter characters are
192      * skipped and only serve as separators between tokens.</p>
193      * <p>If the returnDelims flag is true, then the delimiter characters
194      * are also returned as tokens, one per delimiter.
195      * @param str a string to be parsed.
196      * @param delim the delimiters.
197      * @param returndelims flag indicating whether to return the delimiters
198      * as tokens.
199      * @exception throws a NullPointerException if str is null
200      * @stable ICU 2.4
201      */

202     public StringTokenizer(String JavaDoc str, String JavaDoc delim, boolean returndelims)
203     {
204         this(str, delim, returndelims, false); // java default behavior
205
}
206
207     /**
208      * <p>Constructs a string tokenizer for the specified string. All
209      * characters in the delim argument are the delimiters for separating
210      * tokens.</p>
211      * <p>If the returnDelims flag is false, the delimiter characters are
212      * skipped and only serve as separators between tokens.</p>
213      * <p>If the returnDelims flag is true, then the delimiter characters
214      * are also returned as tokens. If coalescedelims is true, one token
215      * is returned for each run of delimiter characters, otherwise one
216      * token is returned per delimiter. Since surrogate pairs can be
217      * delimiters, the returned token might be two chars in length.</p>
218      * @param str a string to be parsed.
219      * @param delim the delimiters.
220      * @param returndelims flag indicating whether to return the delimiters
221      * as tokens.
222      * @param coalescedelims flag indicating whether to return a run of
223      * delimiters as a single token or as one token per delimiter.
224      * This only takes effect if returndelims is true.
225      * @exception throws a NullPointerException if str is null
226      * @internal ICU 3.4.3
227      * @deprecated This API is ICU internal only.
228      */

229     public StringTokenizer(String JavaDoc str, String JavaDoc delim, boolean returndelims, boolean coalescedelims)
230     {
231         // don't ignore whitespace
232
m_delimiters_ = EMPTY_DELIMITER_;
233         if (delim != null && delim.length() > 0) {
234             m_delimiters_ = new UnicodeSet();
235             m_delimiters_.addAll(delim);
236             checkDelimiters();
237         }
238         m_coalesceDelimiters_ = coalescedelims;
239         m_source_ = str;
240         m_length_ = str.length();
241         m_returnDelimiters_ = returndelims;
242         m_tokenOffset_ = -1;
243         m_tokenSize_ = -1;
244         if (m_length_ == 0) {
245             // string length 0, no tokens
246
m_nextOffset_ = -1;
247         }
248         else {
249             m_nextOffset_ = 0;
250             if (!returndelims) {
251                 m_nextOffset_ = getNextNonDelimiter(0);
252             }
253         }
254     }
255     
256     /**
257      * <p>Constructs a string tokenizer for the specified string. The
258      * characters in the delim argument are the delimiters for separating
259      * tokens.</p>
260      * <p>Delimiter characters themselves will not be treated as tokens.</p>
261      * @param str a string to be parsed.
262      * @param delim the delimiters.
263      * @exception throws a NullPointerException if str is null
264      * @stable ICU 2.4
265      */

266     public StringTokenizer(String JavaDoc str, String JavaDoc delim)
267     {
268         // don't ignore whitespace
269
this(str, delim, false, false);
270     }
271
272     /**
273      * <p>Constructs a string tokenizer for the specified string.
274      * The tokenizer uses the default delimiter set, which is
275      * " &#92;t&#92;n&#92;r&#92;f":
276      * the space character, the tab character, the newline character, the
277      * carriage-return character, and the form-feed character.</p>
278      * <p>Delimiter characters themselves will not be treated as tokens.</p>
279      * @param str a string to be parsed
280      * @exception throws a NullPointerException if str is null
281      * @stable ICU 2.4
282      */

283     public StringTokenizer(String JavaDoc str)
284     {
285         this(str, DEFAULT_DELIMITERS_, false, false);
286     }
287     
288     // public methods --------------------------------------------------
289

290     /**
291      * Tests if there are more tokens available from this tokenizer's
292      * string.
293      * If this method returns <tt>true</tt>, then a subsequent call to
294      * <tt>nextToken</tt> with no argument will successfully return a token.
295      * @return <code>true</code> if and only if there is at least one token
296      * in the string after the current position; <code>false</code>
297      * otherwise.
298      * @stable ICU 2.4
299      */

300     public boolean hasMoreTokens()
301     {
302         return m_nextOffset_ >= 0;
303     }
304     
305     /**
306      * Returns the next token from this string tokenizer.
307      * @return the next token from this string tokenizer.
308      * @exception NoSuchElementException if there are no more tokens in
309      * this tokenizer's string.
310      * @stable ICU 2.4
311      */

312     public String JavaDoc nextToken()
313     {
314         if (m_tokenOffset_ < 0) {
315             if (m_nextOffset_ < 0) {
316                 throw new NoSuchElementException JavaDoc("No more tokens in String");
317             }
318             // pre-calculations of tokens not done
319
if (m_returnDelimiters_) {
320                 int tokenlimit = 0;
321                 int c = UTF16.charAt(m_source_, m_nextOffset_);
322                 boolean contains = delims == null
323                     ? m_delimiters_.contains(c)
324                     : c < delims.length && delims[c];
325                 if (contains) {
326                      if (m_coalesceDelimiters_) {
327                         tokenlimit = getNextNonDelimiter(m_nextOffset_);
328                      } else {
329                         tokenlimit = m_nextOffset_ + UTF16.getCharCount(c);
330                         if (tokenlimit == m_length_) {
331                             tokenlimit = -1;
332                         }
333                      }
334                 }
335                 else {
336                     tokenlimit = getNextDelimiter(m_nextOffset_);
337                 }
338                 String JavaDoc result;
339                 if (tokenlimit < 0) {
340                     result = m_source_.substring(m_nextOffset_);
341                 }
342                 else {
343                     result = m_source_.substring(m_nextOffset_, tokenlimit);
344                 }
345                 m_nextOffset_ = tokenlimit;
346                 return result;
347             }
348             else {
349                 int tokenlimit = getNextDelimiter(m_nextOffset_);
350                 String JavaDoc result;
351                 if (tokenlimit < 0) {
352                     result = m_source_.substring(m_nextOffset_);
353                     m_nextOffset_ = tokenlimit;
354                 }
355                 else {
356                     result = m_source_.substring(m_nextOffset_, tokenlimit);
357                     m_nextOffset_ = getNextNonDelimiter(tokenlimit);
358                 }
359                 
360                 return result;
361             }
362         }
363         // count was called before and we have all the tokens
364
if (m_tokenOffset_ >= m_tokenSize_) {
365             throw new NoSuchElementException JavaDoc("No more tokens in String");
366         }
367         String JavaDoc result;
368         if (m_tokenLimit_[m_tokenOffset_] >= 0) {
369             result = m_source_.substring(m_tokenStart_[m_tokenOffset_],
370                                          m_tokenLimit_[m_tokenOffset_]);
371         }
372         else {
373             result = m_source_.substring(m_tokenStart_[m_tokenOffset_]);
374         }
375         m_tokenOffset_ ++;
376         m_nextOffset_ = -1;
377         if (m_tokenOffset_ < m_tokenSize_) {
378             m_nextOffset_ = m_tokenStart_[m_tokenOffset_];
379         }
380         return result;
381     }
382     
383     /**
384      * Returns the next token in this string tokenizer's string. First,
385      * the set of characters considered to be delimiters by this
386      * <tt>StringTokenizer</tt> object is changed to be the characters in
387      * the string <tt>delim</tt>. Then the next token in the string
388      * after the current position is returned. The current position is
389      * advanced beyond the recognized token. The new delimiter set
390      * remains the default after this call.
391      * @param delim the new delimiters.
392      * @return the next token, after switching to the new delimiter set.
393      * @exception NoSuchElementException if there are no more tokens in
394      * this tokenizer's string.
395      * @stable ICU 2.4
396      */

397     public String JavaDoc nextToken(String JavaDoc delim)
398     {
399         m_delimiters_ = EMPTY_DELIMITER_;
400         if (delim != null && delim.length() > 0) {
401             m_delimiters_ = new UnicodeSet();
402             m_delimiters_.addAll(delim);
403         }
404         return nextToken(m_delimiters_);
405     }
406     
407     /**
408      * Returns the next token in this string tokenizer's string. First,
409      * the set of characters considered to be delimiters by this
410      * <tt>StringTokenizer</tt> object is changed to be the characters in
411      * the string <tt>delim</tt>. Then the next token in the string
412      * after the current position is returned. The current position is
413      * advanced beyond the recognized token. The new delimiter set
414      * remains the default after this call.
415      * @param delim the new delimiters.
416      * @return the next token, after switching to the new delimiter set.
417      * @exception NoSuchElementException if there are no more tokens in
418      * this tokenizer's string.
419      * @stable ICU 2.4
420      */

421     public String JavaDoc nextToken(UnicodeSet delim)
422     {
423         m_delimiters_ = delim;
424         checkDelimiters();
425         m_tokenOffset_ = -1;
426         m_tokenSize_ = -1;
427         if (!m_returnDelimiters_) {
428             m_nextOffset_ = getNextNonDelimiter(m_nextOffset_);
429         }
430         return nextToken();
431     }
432     
433     /**
434      * Returns the same value as the <code>hasMoreTokens</code> method.
435      * It exists so that this class can implement the
436      * <code>Enumeration</code> interface.
437      * @return <code>true</code> if there are more tokens;
438      * <code>false</code> otherwise.
439      * @see #hasMoreTokens()
440      * @stable ICU 2.4
441      */

442     public boolean hasMoreElements()
443     {
444         return hasMoreTokens();
445     }
446     
447     /**
448      * Returns the same value as the <code>nextToken</code> method, except
449      * that its declared return value is <code>Object</code> rather than
450      * <code>String</code>. It exists so that this class can implement the
451      * <code>Enumeration</code> interface.
452      * @return the next token in the string.
453      * @exception NoSuchElementException if there are no more tokens in
454      * this tokenizer's string.
455      * @see #nextToken()
456      * @stable ICU 2.4
457      */

458     public Object JavaDoc nextElement()
459     {
460         return nextToken();
461     }
462     
463     /**
464      * Calculates the number of times that this tokenizer's
465      * <code>nextToken</code> method can be called before it generates an
466      * exception. The current position is not advanced.
467      * @return the number of tokens remaining in the string using the
468      * current delimiter set.
469      * @see #nextToken()
470      * @stable ICU 2.4
471      */

472     public int countTokens()
473     {
474         int result = 0;
475         if (hasMoreTokens()) {
476             if (m_tokenOffset_ >= 0) {
477                 return m_tokenSize_ - m_tokenOffset_;
478             }
479             if (m_tokenStart_ == null) {
480                 m_tokenStart_ = new int[TOKEN_SIZE_];
481                 m_tokenLimit_ = new int[TOKEN_SIZE_];
482             }
483             do {
484                 if (m_tokenStart_.length == result) {
485                     int temptokenindex[] = m_tokenStart_;
486                     int temptokensize[] = m_tokenLimit_;
487                     int originalsize = temptokenindex.length;
488                     int newsize = originalsize + TOKEN_SIZE_;
489                     m_tokenStart_ = new int[newsize];
490                     m_tokenLimit_ = new int[newsize];
491                     System.arraycopy(temptokenindex, 0, m_tokenStart_, 0,
492                                      originalsize);
493                     System.arraycopy(temptokensize, 0, m_tokenLimit_, 0,
494                                      originalsize);
495                 }
496                 m_tokenStart_[result] = m_nextOffset_;
497                 if (m_returnDelimiters_) {
498                     int c = UTF16.charAt(m_source_, m_nextOffset_);
499                     boolean contains = delims == null
500                         ? m_delimiters_.contains(c)
501                         : c < delims.length && delims[c];
502                     if (contains) {
503                         if (m_coalesceDelimiters_) {
504                             m_tokenLimit_[result] = getNextNonDelimiter(
505                                                                 m_nextOffset_);
506                         } else {
507                             int p = m_nextOffset_ + 1;
508                             if (p == m_length_) {
509                                 p = -1;
510                             }
511                             m_tokenLimit_[result] = p;
512
513                         }
514                     }
515                     else {
516                         m_tokenLimit_[result] = getNextDelimiter(m_nextOffset_);
517                     }
518                     m_nextOffset_ = m_tokenLimit_[result];
519                 }
520                 else {
521                     m_tokenLimit_[result] = getNextDelimiter(m_nextOffset_);
522                     m_nextOffset_ = getNextNonDelimiter(m_tokenLimit_[result]);
523                 }
524                 result ++;
525             } while (m_nextOffset_ >= 0);
526             m_tokenOffset_ = 0;
527             m_tokenSize_ = result;
528             m_nextOffset_ = m_tokenStart_[0];
529         }
530         return result;
531     }
532     
533     // private data members -------------------------------------------------
534

535     /**
536      * Current offset to the token array. If the array token is not set up yet,
537      * this value is a -1
538      */

539     private int m_tokenOffset_;
540     /**
541      * Size of the token array. If the array token is not set up yet,
542      * this value is a -1
543      */

544     private int m_tokenSize_;
545     /**
546      * Array of pre-calculated tokens start indexes in source string terminated
547      * by -1.
548      * This is only set up during countTokens() and only stores the remaining
549      * tokens, not all tokens including parsed ones
550      */

551     private int m_tokenStart_[];
552     /**
553      * Array of pre-calculated tokens limit indexes in source string.
554      * This is only set up during countTokens() and only stores the remaining
555      * tokens, not all tokens including parsed ones
556      */

557     private int m_tokenLimit_[];
558     /**
559      * UnicodeSet containing delimiters
560      */

561     private UnicodeSet m_delimiters_;
562     /**
563      * String to parse for tokens
564      */

565     private String JavaDoc m_source_;
566     /**
567      * Length of m_source_
568      */

569     private int m_length_;
570     /**
571      * Current position in string to parse for tokens
572      */

573     private int m_nextOffset_;
574     /**
575      * Flag indicator if delimiters are to be treated as tokens too
576      */

577     private boolean m_returnDelimiters_;
578
579     /**
580      * Flag indicating whether to coalesce runs of delimiters into single tokens
581      */

582     private boolean m_coalesceDelimiters_;
583
584     /**
585      * Default set of delimiters &#92;t&#92;n&#92;r&#92;f
586      */

587     private static final UnicodeSet DEFAULT_DELIMITERS_
588                                         = new UnicodeSet("[ \t\n\r\f]", false);
589     /**
590      * Array size increments
591      */

592     private static final int TOKEN_SIZE_ = 100;
593     /**
594      * A empty delimiter UnicodeSet, used when user specified null delimiters
595      */

596     private static final UnicodeSet EMPTY_DELIMITER_ = new UnicodeSet();
597     
598     // private methods ------------------------------------------------------
599

600     /**
601      * Gets the index of the next delimiter after offset
602      * @param offset to the source string
603      * @return offset of the immediate next delimiter, otherwise
604      * (- source string length - 1) if there
605      * are no more delimiters after m_nextOffset
606      */

607     private int getNextDelimiter(int offset)
608     {
609         if (offset >= 0) {
610             int result = offset;
611             int c = 0;
612             if (delims == null) {
613                 do {
614                     c = UTF16.charAt(m_source_, result);
615                     if (m_delimiters_.contains(c)) {
616                         break;
617                     }
618                     result ++;
619                 } while (result < m_length_);
620             } else {
621                 do {
622                     c = UTF16.charAt(m_source_, result);
623                     if (c < delims.length && delims[c]) {
624                         break;
625                     }
626                     result ++;
627                 } while (result < m_length_);
628             }
629             if (result < m_length_) {
630                 return result;
631             }
632         }
633         return -1 - m_length_;
634     }
635     
636     /**
637      * Gets the index of the next non-delimiter after m_nextOffset_
638      * @param offset to the source string
639      * @return offset of the immediate next non-delimiter, otherwise
640      * (- source string length - 1) if there
641      * are no more delimiters after m_nextOffset
642      */

643     private int getNextNonDelimiter(int offset)
644     {
645         if (offset >= 0) {
646             int result = offset;
647             int c = 0;
648             if (delims == null) {
649                 do {
650                     c = UTF16.charAt(m_source_, result);
651                     if (!m_delimiters_.contains(c)) {
652                         break;
653                     }
654                     result ++;
655                 } while (result < m_length_);
656             } else {
657                 do {
658                     c = UTF16.charAt(m_source_, result);
659                     if (!(c < delims.length && delims[c])) {
660                         break;
661                     }
662                     result ++;
663                 } while (result < m_length_);
664             }
665             if (result < m_length_) {
666                 return result;
667             }
668         }
669         return -1 - m_length_;
670     }
671
672     void checkDelimiters() {
673         if (m_delimiters_ == null || m_delimiters_.size() == 0) {
674             delims = new boolean[0];
675         } else {
676             int maxChar = m_delimiters_.getRangeEnd(m_delimiters_.getRangeCount()-1);
677             if (maxChar < 0x7f) {
678                 delims = new boolean[maxChar+1];
679                 for (int i = 0, ch; -1 != (ch = m_delimiters_.charAt(i)); ++i) {
680                     delims[ch] = true;
681                 }
682             } else {
683                 delims = null;
684             }
685         }
686     }
687     private boolean[] delims;
688 }
689
Popular Tags