BreakIterator


1   /*
2    * @(#)BreakIterator.java   1.35 03/12/19
3    *
4    * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
5    * SUN PROPRIETARY/CONFIDENTIAL. Use is subject to license terms.
6    */
7   
8   /*
9    * (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved
10   * (C) Copyright IBM Corp. 1996 - 1998 - All Rights Reserved
11   *
12   * The original version of this source code and documentation
13   * is copyrighted and owned by Taligent, Inc., a wholly-owned
14   * subsidiary of IBM. These materials are provided under terms
15   * of a License Agreement between Taligent and Sun. This technology
16   * is protected by multiple US and International patents.
17   *
18   * This notice and attribution to Taligent may not be removed.
19   * Taligent is a registered trademark of Taligent, Inc.
20   *
21   */
22  
23  package java.text;
24  
25  import java.util.Vector  ;
26  import java.util.Locale  ;
27  import java.util.ResourceBundle  ;
28  import java.util.MissingResourceException  ;
29  import sun.text.resources.LocaleData;
30  import java.text.CharacterIterator  ;
31  import java.text.StringCharacterIterator  ;
32  
33  import java.net.URL  ;
34  import java.io.InputStream  ;
35  import java.io.IOException  ;
36  
37  import java.lang.ref.SoftReference  ;
38  import java.security.AccessController  ;
39  import java.security.PrivilegedAction  ;
40  
41  /**
42   * The <code>BreakIterator</code> class implements methods for finding
43   * the location of boundaries in text. Instances of <code>BreakIterator</code>
44   * maintain a current position and scan over text
45   * returning the index of characters where boundaries occur.
46   * Internally, <code>BreakIterator</code> scans text using a
47   * <code>CharacterIterator</code>, and is thus able to scan text held
48   * by any object implementing that protocol. A <code>StringCharacterIterator</code>
49   * is used to scan <code>String</code> objects passed to <code>setText</code>.
50   *
51   * <p>
52   * You use the factory methods provided by this class to create
53   * instances of various types of break iterators. In particular,
54   * use <code>getWordInstance</code>, <code>getLineInstance</code>,
55   * <code>getSentenceInstance</code>, and <code>getCharacterInstance</code>
56   * to create <code>BreakIterator</code>s that perform
57   * word, line, sentence, and character boundary analysis respectively.
58   * A single <code>BreakIterator</code> can work only on one unit
59   * (word, line, sentence, and so on). You must use a different iterator
60   * for each unit boundary analysis you wish to perform.
61   *
62   * <p>
63   * Line boundary analysis determines where a text string can be
64   * broken when line-wrapping. The mechanism correctly handles
65   * punctuation and hyphenated words.
66   *
67   * <p>
68   * Sentence boundary analysis allows selection with correct interpretation
69   * of periods within numbers and abbreviations, and trailing punctuation
70   * marks such as quotation marks and parentheses.
71   *
72   * <p>
73   * Word boundary analysis is used by search and replace functions, as
74   * well as within text editing applications that allow the user to
75   * select words with a double click. Word selection provides correct
76   * interpretation of punctuation marks within and following
77   * words. Characters that are not part of a word, such as symbols
78   * or punctuation marks, have word-breaks on both sides.
79   *
80   * <p>
81   * Character boundary analysis allows users to interact with characters
82   * as they expect to, for example, when moving the cursor through a text
83   * string. Character boundary analysis provides correct navigation
84   * through character strings, regardless of how the character is stored.
85   * For example, an accented character might be stored as a base character
86   * and a diacritical mark. What users consider to be a character can
87   * differ between languages.
88   *
89   * <p>
90   * <code>BreakIterator</code> is intended for use with natural
91   * languages only. Do not use this class to tokenize a programming language.
92   *
93   * <P>
94   * <strong>Examples</strong>:<P>
95   * Creating and using text boundaries
96   * <blockquote>
97   * <pre>
98   * public static void main(String args[]) {
99   *      if (args.length == 1) {
100  *          String stringToExamine = args[0];
101  *          //print each word in order
102  *          BreakIterator boundary = BreakIterator.getWordInstance();
103  *          boundary.setText(stringToExamine);
104  *          printEachForward(boundary, stringToExamine);
105  *          //print each sentence in reverse order
106  *          boundary = BreakIterator.getSentenceInstance(Locale.US);
107  *          boundary.setText(stringToExamine);
108  *          printEachBackward(boundary, stringToExamine);
109  *          printFirst(boundary, stringToExamine);
110  *          printLast(boundary, stringToExamine);
111  *      }
112  * }
113  * </pre>
114  * </blockquote>
115  *
116  * Print each element in order
117  * <blockquote>
118  * <pre>
119  * public static void printEachForward(BreakIterator boundary, String source) {
120  *     int start = boundary.first();
121  *     for (int end = boundary.next();
122  *          end != BreakIterator.DONE;
123  *          start = end, end = boundary.next()) {
124  *          System.out.println(source.substring(start,end));
125  *     }
126  * }
127  * </pre>
128  * </blockquote>
129  *
130  * Print each element in reverse order
131  * <blockquote>
132  * <pre>
133  * public static void printEachBackward(BreakIterator boundary, String source) {
134  *     int end = boundary.last();
135  *     for (int start = boundary.previous();
136  *          start != BreakIterator.DONE;
137  *          end = start, start = boundary.previous()) {
138  *         System.out.println(source.substring(start,end));
139  *     }
140  * }
141  * </pre>
142  * </blockquote>
143  *
144  * Print first element
145  * <blockquote>
146  * <pre>
147  * public static void printFirst(BreakIterator boundary, String source) {
148  *     int start = boundary.first();
149  *     int end = boundary.next();
150  *     System.out.println(source.substring(start,end));
151  * }
152  * </pre>
153  * </blockquote>
154  *
155  * Print last element
156  * <blockquote>
157  * <pre>
158  * public static void printLast(BreakIterator boundary, String source) {
159  *     int end = boundary.last();
160  *     int start = boundary.previous();
161  *     System.out.println(source.substring(start,end));
162  * }
163  * </pre>
164  * </blockquote>
165  *
166  * Print the element at a specified position
167  * <blockquote>
168  * <pre>
169  * public static void printAt(BreakIterator boundary, int pos, String source) {
170  *     int end = boundary.following(pos);
171  *     int start = boundary.previous();
172  *     System.out.println(source.substring(start,end));
173  * }
174  * </pre>
175  * </blockquote>
176  *
177  * Find the next word
178  * <blockquote>
179  * <pre>
180  * public static int nextWordStartAfter(int pos, String text) {
181  *     BreakIterator wb = BreakIterator.getWordInstance();
182  *     wb.setText(text);
183  *     int last = wb.following(pos);
184  *     int current = wb.next();
185  *     while (current != BreakIterator.DONE) {
186  *         for (int p = last; p < current; p++) {
187  *             if (Character.isLetter(text.codePointAt(p)))
188  *                 return last;
189  *         }
190  *         last = current;
191  *         current = wb.next();
192  *     }
193  *     return BreakIterator.DONE;
194  * }
195  * </pre>
196  * (The iterator returned by BreakIterator.getWordInstance() is unique in that
197  * the break positions it returns don't represent both the start and end of the
198  * thing being iterated over.  That is, a sentence-break iterator returns breaks
199  * that each represent the end of one sentence and the beginning of the next.
200  * With the word-break iterator, the characters between two boundaries might be a
201  * word, or they might be the punctuation or whitespace between two words.  The
202  * above code uses a simple heuristic to determine which boundary is the beginning
203  * of a word: If the characters between this boundary and the next boundary
204  * include at least one letter (this can be an alphabetical letter, a CJK ideograph,
205  * a Hangul syllable, a Kana character, etc.), then the text between this boundary
206  * and the next is a word; otherwise, it's the material between words.)
207  * </blockquote>
208  *
209  * @see CharacterIterator
210  *
211  */
212 
213 public abstract class BreakIterator implements Cloneable  
214 {
215     /**
216      * Constructor. BreakIterator is stateless and has no default behavior.
217      */
218     protected BreakIterator()
219     {
220     }
221 
222     /**
223      * Create a copy of this iterator
224      * @return A copy of this
225      */
226     public Object   clone()
227     {
228         try {
229             return super.clone();
230         }
231         catch (CloneNotSupportedException   e) {
232             throw new InternalError  ();
233         }
234     }
235 
236     /**
237      * DONE is returned by previous() and next() after all valid
238      * boundaries have been returned.
239      */
240     public static final int DONE = -1;
241 
242     /**
243      * Return the first boundary. The iterator's current position is set
244      * to the first boundary.
245      * @return The character index of the first text boundary.
246      */
247     public abstract int first();
248 
249     /**
250      * Return the last boundary. The iterator's current position is set
251      * to the last boundary.
252      * @return The character index of the last text boundary.
253      */
254     public abstract int last();
255 
256     /**
257      * Return the nth boundary from the current boundary
258      * @param n which boundary to return.  A value of 0
259      * does nothing.  Negative values move to previous boundaries
260      * and positive values move to later boundaries.
261      * @return The index of the nth boundary from the current position.
262      */
263     public abstract int next(int n);
264 
265     /**
266      * Return the boundary following the current boundary.
267      * @return The character index of the next text boundary or DONE if all
268      * boundaries have been returned.  Equivalent to next(1).
269      */
270     public abstract int next();
271 
272     /**
273      * Return the boundary preceding the current boundary.
274      * @return The character index of the previous text boundary or DONE if all
275      * boundaries have been returned.
276      */
277     public abstract int previous();
278 
279     /**
280      * Return the first boundary following the specified offset.
281      * The value returned is always greater than the offset or
282      * the value BreakIterator.DONE
283      * @param offset the offset to begin scanning. Valid values
284      * are determined by the CharacterIterator passed to
285      * setText().  Invalid values cause
286      * an IllegalArgumentException to be thrown.
287      * @return The first boundary after the specified offset.
288      */
289     public abstract int following(int offset);
290 
291     /**
292      * Return the last boundary preceding the specfied offset.
293      * The value returned is always less than the offset or the value
294      * BreakIterator.DONE.
295      * @param offset the offset to begin scanning.  Valid values are
296      * determined by the CharacterIterator passed to setText().
297      * Invalid values cause an IllegalArgumentException to be thrown.
298      * @return The last boundary before the specified offset.
299      * @since 1.2
300      */
301     public int preceding(int offset) {
302         // NOTE:  This implementation is here solely because we can't add new
303         // abstract methods to an existing class.  There is almost ALWAYS a
304         // better, faster way to do this.
305         int pos = following(offset);
306         while (pos >= offset && pos != DONE)
307             pos = previous();
308         return pos;
309     }
310 
311     /**
312      * Return true if the specified position is a boundary position.
313      * @param offset the offset to check.
314      * @return True if "offset" is a boundary position.
315      * @since 1.2
316      */
317     public boolean isBoundary(int offset) {
318         // NOTE: This implementation probably is wrong for most situations
319         // because it fails to take into account the possibility that a
320         // CharacterIterator passed to setText() may not have a begin offset
321         // of 0.  But since the abstract BreakIterator doesn't have that
322         // knowledge, it assumes the begin offset is 0.  If you subclass
323         // BreakIterator, copy the SimpleTextBoundary implementation of this
324         // function into your subclass.  [This should have been abstract at
325         // this level, but it's too late to fix that now.]
326         if (offset == 0)
327             return true;
328         else
329             return following(offset - 1) == offset;
330     }
331 
332     /**
333      * Return character index of the text boundary that was most recently
334      * returned by next(), previous(), first(), or last()
335      * @return The boundary most recently returned.
336      */
337     public abstract int current();
338 
339     /**
340      * Get the text being scanned
341      * @return the text being scanned
342      */
343     public abstract CharacterIterator   getText();
344 
345     /**
346      * Set a new text string to be scanned.  The current scan
347      * position is reset to first().
348      * @param newText new text to scan.
349      */
350     public void setText(String   newText)
351     {
352         setText(new StringCharacterIterator  (newText));
353     }
354 
355     /**
356      * Set a new text for scanning.  The current scan
357      * position is reset to first().
358      * @param newText new text to scan.
359      */
360     public abstract void setText(CharacterIterator   newText);
361 
362     private static final int CHARACTER_INDEX = 0;
363     private static final int WORD_INDEX = 1;
364     private static final int LINE_INDEX = 2;
365     private static final int SENTENCE_INDEX = 3;
366     private static final SoftReference  [] iterCache = new SoftReference  [4];
367 
368     /**
369      * Create BreakIterator for word-breaks using default locale.
370      * Returns an instance of a BreakIterator implementing word breaks.
371      * WordBreak  is usefull for word selection (ex. double click)
372      * @return A BreakIterator for word-breaks
373      * @see java.util.Locale#getDefault
374      */
375     public static BreakIterator   getWordInstance()
376     {
377         return getWordInstance(Locale.getDefault());
378     }
379 
380     /**
381      * Create BreakIterator for word-breaks using specified locale.
382      * Returns an instance of a BreakIterator implementing word breaks.
383      * WordBreak is usefull for word selection (ex. double click)
384      * @param where the local.  If a specific WordBreak is not
385      * avaliable for the specified locale, a default WordBreak is returned.
386      * @return A BreakIterator for word-breaks
387      */
388     public static BreakIterator   getWordInstance(Locale   where)
389     {
390         return getBreakInstance(where,
391                                 WORD_INDEX,
392                                 "WordData",
393                                 "WordDictionary");
394     }
395 
396     /**
397      * Create BreakIterator for line-breaks using default locale.
398      * Returns an instance of a BreakIterator implementing line breaks. Line
399      * breaks are logically possible line breaks, actual line breaks are
400      * usually determined based on display width.
401      * LineBreak is useful for word wrapping text.
402      * @return A BreakIterator for line-breaks
403      * @see java.util.Locale#getDefault
404      */
405     public static BreakIterator   getLineInstance()
406     {
407         return getLineInstance(Locale.getDefault());
408     }
409 
410     /**
411      * Create BreakIterator for line-breaks using specified locale.
412      * Returns an instance of a BreakIterator implementing line breaks. Line
413      * breaks are logically possible line breaks, actual line breaks are
414      * usually determined based on display width.
415      * LineBreak is useful for word wrapping text.
416      * @param where the local.  If a specific LineBreak is not
417      * avaliable for the specified locale, a default LineBreak is returned.
418      * @return A BreakIterator for line-breaks
419      */
420     public static BreakIterator   getLineInstance(Locale   where)
421     {
422         return getBreakInstance(where,
423                                 LINE_INDEX,
424                                 "LineData",
425                                 "LineDictionary");
426     }
427 
428     /**
429      * Create BreakIterator for character-breaks using default locale
430      * Returns an instance of a BreakIterator implementing character breaks.
431      * Character breaks are boundaries of combining character sequences.
432      * @return A BreakIterator for character-breaks
433      * @see Locale#getDefault
434      */
435     public static BreakIterator   getCharacterInstance()
436     {
437         return getCharacterInstance(Locale.getDefault());
438     }
439 
440     /**
441      * Create BreakIterator for character-breaks using specified locale
442      * Returns an instance of a BreakIterator implementing character breaks.
443      * Character breaks are boundaries of combining character sequences.
444      * @param where the local.  If a specific character break is not
445      * avaliable for the specified local, a default character break is returned.
446      * @return A BreakIterator for character-breaks
447      */
448     public static BreakIterator   getCharacterInstance(Locale   where)
449     {
450         return getBreakInstance(where,
451                                 CHARACTER_INDEX,
452                                 "CharacterData",
453                                 "CharacterDictionary");
454     }
455 
456     /**
457      * Create BreakIterator for sentence-breaks using default locale
458      * Returns an instance of a BreakIterator implementing sentence breaks.
459      * @return A BreakIterator for sentence-breaks
460      * @see java.util.Locale#getDefault
461      */
462     public static BreakIterator   getSentenceInstance()
463     {
464         return getSentenceInstance(Locale.getDefault());
465     }
466 
467     /**
468      * Create BreakIterator for sentence-breaks using specified locale
469      * Returns an instance of a BreakIterator implementing sentence breaks.
470      * @param where the local.  If a specific SentenceBreak is not
471      * avaliable for the specified local, a default SentenceBreak is returned.
472      * @return A BreakIterator for sentence-breaks
473      */
474     public static BreakIterator   getSentenceInstance(Locale   where)
475     {
476         return getBreakInstance(where,
477                                 SENTENCE_INDEX,
478                                 "SentenceData",
479                                 "SentenceDictionary");
480     }
481 
482     private static BreakIterator   getBreakInstance(Locale   where,
483                                                   int type,
484                                                   String   dataName,
485                                                   String   dictionaryName) {
486         if (iterCache[type] != null) {
487             BreakIteratorCache cache = (BreakIteratorCache) iterCache[type].get();
488             if (cache != null) {
489                 if (cache.getLocale().equals(where)) {
490                     return cache.createBreakInstance();
491                 }
492             }
493         }
494 
495         BreakIterator   result = createBreakInstance(where,
496                                                    type,
497                                                    dataName,
498                                                    dictionaryName);
499         BreakIteratorCache cache = new BreakIteratorCache(where, result);
500         iterCache[type] = new SoftReference  (cache);
501         return result;
502     }
503 
504     private static ResourceBundle   getBundle(final String   baseName, final Locale   locale) {
505          return (ResourceBundle  ) AccessController.doPrivileged(new PrivilegedAction  () {
506             public Object   run() {
507                 return ResourceBundle.getBundle(baseName, locale);
508             }
509         });
510     }
511 
512     private static BreakIterator   createBreakInstance(Locale   where,
513                                                      int type,
514                                                      String   dataName,
515                                                      String   dictionaryName) {
516 
517         ResourceBundle   bundle = getBundle(
518                         "sun.text.resources.BreakIteratorInfo", where);
519         String  [] classNames = bundle.getStringArray("BreakIteratorClasses");
520 
521         String   dataFile = bundle.getString(dataName);
522 
523         try {
524             if (classNames[type].equals("RuleBasedBreakIterator")) {
525                 return new RuleBasedBreakIterator  (dataFile);
526             }
527             else if (classNames[type].equals("DictionaryBasedBreakIterator")) {
528                 String   dictionaryFile = bundle.getString(dictionaryName);
529                 return new DictionaryBasedBreakIterator  (dataFile, dictionaryFile);
530             }
531             else {
532                 throw new IllegalArgumentException  ("Invalid break iterator class \"" +
533                                 classNames[type] + "\"");
534             }
535         }
536         catch (Exception   e) {
537             throw new InternalError  (e.toString()); 
538         }
539     }
540 
541     /**
542      * Returns an array of all locales for which the
543      * <code>get*Instance</code> methods of this class can return
544      * localized instances.
545      * The array returned must contain at least a <code>Locale</code>
546      * instance equal to {@link java.util.Locale#US Locale.US}.
547      *
548      * @return An array of locales for which localized
549      *         <code>BreakIterator</code> instances are available.
550      */
551     public static synchronized Locale  [] getAvailableLocales()
552     {
553         //FIX ME - this is a known bug.  It should return
554         //all locales.
555         return LocaleData.getAvailableLocales("NumberPatterns");
556     }
557 
558     private static final class BreakIteratorCache {
559 
560         private BreakIterator   iter;
561         private Locale   where;
562 
563         BreakIteratorCache(Locale   where, BreakIterator   iter) {
564             this.where = where;
565             this.iter = (BreakIterator  ) iter.clone();
566         }
567 
568         Locale   getLocale() {
569             return where;
570         }
571 
572         BreakIterator   createBreakInstance() {
573             return (BreakIterator  ) iter.clone();
574         }
575     }
576 
577     protected static long getLong(byte[] buf, int offset) {
578         long num = buf[offset]&0xFF;
579         for (int i = 1; i < 8; i++) {
580             num = num<<8 | (buf[offset+i]&0xFF);
581         }
582         return num;
583     }
584 
585     protected static int getInt(byte[] buf, int offset) {
586         int num = buf[offset]&0xFF;
587         for (int i = 1; i < 4; i++) {
588             num = num<<8 | (buf[offset+i]&0xFF);
589         }
590         return num;
591     }
592 
593     protected static short getShort(byte[] buf, int offset) {
594         short num = (short)(buf[offset]&0xFF);
595         num = (short)(num<<8 | (buf[offset+1]&0xFF));
596         return num;
597     }
598 }
599
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags