KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > com > ibm > icu > text > RuleBasedBreakIterator


1 //##header 1189099963000 FOUNDATION
2
/*
3  *******************************************************************************
4  * Copyright (C) 2005-2006 International Business Machines Corporation and *
5  * others. All Rights Reserved. *
6  *******************************************************************************
7  */

8 package com.ibm.icu.text;
9
10 import java.text.CharacterIterator JavaDoc;
11 import java.io.IOException JavaDoc;
12 import java.io.InputStream JavaDoc;
13 import java.io.OutputStream JavaDoc;
14 import java.io.ByteArrayInputStream JavaDoc;
15 import java.io.ByteArrayOutputStream JavaDoc;
16
17 import com.ibm.icu.impl.Assert;
18
19
20 /**
21  * Rule Based Break Iterator
22  * This is a port of the C++ class RuleBasedBreakIterator from ICU4C.
23  *
24  * @stable ICU 2.0
25  */

26 public class RuleBasedBreakIterator extends BreakIterator {
27
28     
29     //=======================================================================
30
// Constructors & Factories
31
//=======================================================================
32

33     /**
34      * @internal
35      * @deprecated This API is ICU internal only.
36      */

37     public RuleBasedBreakIterator() {
38     }
39
40     /**
41      * Create a break iterator from a precompiled set of rules.
42      * @internal
43      * @deprecated This API is ICU internal only.
44      */

45     public static RuleBasedBreakIterator getInstanceFromCompiledRules(InputStream JavaDoc is) throws IOException JavaDoc {
46         RuleBasedBreakIterator This = new RuleBasedBreakIterator();
47         This.fRData = RBBIDataWrapper.get(is);
48         return This;
49     }
50     
51     private RuleBasedBreakIterator(RuleBasedBreakIterator other) {
52         // TODO: check types.
53
fRData = other.fRData;
54         if (fText != null) {
55             fText = (CharacterIterator JavaDoc)(other.fText.clone());
56         }
57     }
58
59     /**
60      * Construct a RuleBasedBreakIterator from a set of rules supplied as a string.
61      * @param rules The break rules to be used.
62      * @param parseError In the event of a syntax error in the rules, provides the location
63      * within the rules of the problem.
64      * @param status Information on any errors encountered.
65      * @stable ICU 2.2
66      */

67     public RuleBasedBreakIterator(String JavaDoc rules) {
68         init();
69         try {
70             ByteArrayOutputStream JavaDoc ruleOS = new ByteArrayOutputStream JavaDoc();
71             compileRules(rules, ruleOS);
72             byte [] ruleBA = ruleOS.toByteArray();
73             InputStream JavaDoc ruleIS = new ByteArrayInputStream JavaDoc(ruleBA);
74             fRData = RBBIDataWrapper.get(ruleIS);
75         } catch (IOException JavaDoc e) {
76             // An IO exception can only arrive here if there is a bug in the RBBI Rule compiler,
77
// causing bogus compiled rules to be produced, but with no compile error raised.
78
//#ifdef FOUNDATION
79
RuntimeException JavaDoc rte = new RuntimeException JavaDoc("RuleBasedBreakIterator rule compilation internal error:");
80 //#else
81
//## RuntimeException rte = new RuntimeException("RuleBasedBreakIterator rule compilation internal error:", e);
82
//#endif
83
throw rte;
84         }
85     }
86     
87     
88     //=======================================================================
89
// Boilerplate
90
//=======================================================================
91

92     /**
93      * Clones this iterator.
94      * @return A newly-constructed RuleBasedBreakIterator with the same
95      * behavior as this one.
96      * @stable ICU 2.0
97      */

98     public Object JavaDoc clone()
99     {
100         RuleBasedBreakIterator result = new RuleBasedBreakIterator(this);
101         return result;
102     }
103
104     /**
105      * Returns true if both BreakIterators are of the same class, have the same
106      * rules, and iterate over the same text.
107      * @stable ICU 2.0
108      */

109     public boolean equals(Object JavaDoc that) {
110         try {
111             RuleBasedBreakIterator other = (RuleBasedBreakIterator) that;
112             if (fRData != other.fRData && (fRData == null || other.fRData == null)) {
113                 return false;
114             }
115             if (fRData != null && other.fRData != null &&
116                     (!fRData.fRuleSource.equals(other.fRData.fRuleSource))) {
117                 return false;
118             }
119             if (fText == null && other.fText == null) {
120                 return true;
121             }
122             if (fText == null || other.fText == null) {
123                 return false;
124             }
125             return fText.equals(other.fText);
126         }
127         catch(ClassCastException JavaDoc e) {
128             return false;
129         }
130      }
131
132     /**
133      * Returns the description (rules) used to create this iterator.
134      * (In ICU4C, the same function is RuleBasedBreakIterator::getRules())
135      * @stable ICU 2.0
136      */

137     public String JavaDoc toString() {
138         String JavaDoc retStr = null;
139         if (fRData != null) {
140             retStr = fRData.fRuleSource;
141         }
142         return retStr;
143     }
144
145     /**
146      * Compute a hashcode for this BreakIterator
147      * @return A hash code
148      * @stable ICU 2.0
149      */

150     public int hashCode()
151     {
152         return fRData.fRuleSource.hashCode();
153     }
154
155     
156     /**
157      * Tag value for "words" that do not fit into any of other categories.
158      * Includes spaces and most punctuation.
159      * @draft ICU 3.0
160      * @provisional This is a draft API and might change in a future release of ICU.
161      */

162     public static final int WORD_NONE = 0;
163
164     /**
165      * Upper bound for tags for uncategorized words.
166      * @draft ICU 3.0
167      * @provisional This is a draft API and might change in a future release of ICU.
168      */

169     public static final int WORD_NONE_LIMIT = 100;
170
171     /**
172      * Tag value for words that appear to be numbers, lower limit.
173      * @draft ICU 3.0
174      * @provisional This is a draft API and might change in a future release of ICU.
175      */

176     public static final int WORD_NUMBER = 100;
177
178     /**
179      * Tag value for words that appear to be numbers, upper limit.
180      * @draft ICU 3.0
181      * @provisional This is a draft API and might change in a future release of ICU.
182      */

183     public static final int WORD_NUMBER_LIMIT = 200;
184
185     /**
186      * Tag value for words that contain letters, excluding
187      * hiragana, katakana or ideographic characters, lower limit.
188      * @draft ICU 3.0
189      * @provisional This is a draft API and might change in a future release of ICU.
190      */

191     public static final int WORD_LETTER = 200;
192
193     /**
194      * Tag value for words containing letters, upper limit
195      * @draft ICU 3.0
196      * @provisional This is a draft API and might change in a future release of ICU.
197      */

198     public static final int WORD_LETTER_LIMIT = 300;
199
200     /**
201      * Tag value for words containing kana characters, lower limit
202      * @draft ICU 3.0
203      * @provisional This is a draft API and might change in a future release of ICU.
204      */

205     public static final int WORD_KANA = 300;
206
207     /**
208      * Tag value for words containing kana characters, upper limit
209      * @draft ICU 3.0
210      * @provisional This is a draft API and might change in a future release of ICU.
211      */

212     public static final int WORD_KANA_LIMIT = 400;
213
214     /**
215      * Tag value for words containing ideographic characters, lower limit
216      * @draft ICU 3.0
217      * @provisional This is a draft API and might change in a future release of ICU.
218      */

219     public static final int WORD_IDEO = 400;
220
221     /**
222      * Tag value for words containing ideographic characters, upper limit
223      * @draft ICU 3.0
224      * @provisional This is a draft API and might change in a future release of ICU.
225      */

226     public static final int WORD_IDEO_LIMIT = 500;
227
228    
229     
230     
231     private static final int START_STATE = 1; // The state number of the starting state
232
private static final int STOP_STATE = 0; // The state-transition value indicating "stop"
233

234     // RBBIRunMode - the state machine runs an extra iteration at the beginning and end
235
// of user text. A variable with this enum type keeps track of where we
236
// are. The state machine only fetches user text input while in RUN mode.
237
private static final int RBBI_START = 0;
238     private static final int RBBI_RUN = 1;
239     private static final int RBBI_END = 2;
240
241     /**
242      * The character iterator through which this BreakIterator accesses the text.
243      *
244      * @internal
245      * @deprecated This API is ICU internal only.
246      */

247     private CharacterIterator JavaDoc fText = new java.text.StringCharacterIterator JavaDoc("");
248     
249     /**
250      * The rule data for this BreakIterator instance
251      * @internal
252      * @deprecated This API is ICU internal only.
253      */

254     protected RBBIDataWrapper fRData;
255     
256     /** Index of the Rule {tag} values for the most recent match.
257      * @internal
258      * @deprecated This API is ICU internal only.
259      */

260     private int fLastRuleStatusIndex;
261
262     /**
263      * Rule tag value valid flag.
264      * Some iterator operations don't intrinsically set the correct tag value.
265      * This flag lets us lazily compute the value if we are ever asked for it.
266      * @internal
267      * @deprecated This API is ICU internal only.
268      */

269     private boolean fLastStatusIndexValid;
270     
271     /**
272      * Counter for the number of characters encountered with the "dictionary"
273      * flag set. Normal RBBI iterators don't use it, although the code
274      * for updating it is live. Dictionary Based break iterators (a subclass
275      * of us) access this field directly.
276      * @internal
277      * @deprecated This API is ICU internal only.
278      */

279      protected int fDictionaryCharCount;
280     
281     /**
282      * Debugging flag. Trace operation of state machine when true.
283      * @internal
284      * @deprecated This API is ICU internal only.
285      */

286     public static boolean fTrace;
287     
288     
289     /**
290      * Dump the contents of the state table and character classes for this break iterator.
291      * For debugging only.
292      * @internal
293      * @deprecated This API is ICU internal only.
294      */

295     public void dump() {
296         this.fRData.dump();
297     }
298
299     private static boolean debugInitDone = false;
300     
301     private void init() {
302         fLastStatusIndexValid = true;
303         fDictionaryCharCount = 0;
304
305  
306         if (debugInitDone == false) {
307             String JavaDoc debugEnv = System.getProperty("U_RBBIDEBUG");
308             if (debugEnv!=null && debugEnv.indexOf("trace")>=0) {
309                 fTrace = true;
310             }
311             debugInitDone = true;
312         }
313     }
314
315     private static void compileRules(String JavaDoc rules, OutputStream JavaDoc ruleBinary) throws IOException JavaDoc {
316         RBBIRuleBuilder.compileRules(rules, ruleBinary);
317     }
318     
319     //=======================================================================
320
// BreakIterator overrides
321
//=======================================================================
322

323     /**
324      * Sets the current iteration position to the beginning of the text.
325      * (i.e., the CharacterIterator's starting offset).
326      * @return The offset of the beginning of the text.
327      * @stable ICU 2.0
328      */

329     public int first() {
330         fLastRuleStatusIndex = 0;
331         fLastStatusIndexValid = true;
332         if (fText == null) {
333             return BreakIterator.DONE;
334         }
335         fText.first();
336         return fText.getIndex();
337     }
338     
339     
340     /**
341      * Sets the current iteration position to the end of the text.
342      * (i.e., the CharacterIterator's ending offset).
343      * @return The text's past-the-end offset.
344      * @stable ICU 2.0
345      */

346     public int last() {
347         if (fText == null) {
348             fLastRuleStatusIndex = 0;
349             fLastStatusIndexValid = true;
350             return BreakIterator.DONE;
351         }
352
353         // I'm not sure why, but t.last() returns the offset of the last character,
354
// rather than the past-the-end offset
355
//
356
// (It's so a loop like for(p=it.last(); p!=DONE; p=it.previous()) ...
357
// will work correctly.)
358

359
360         fLastStatusIndexValid = false;
361         int pos = fText.getEndIndex();
362         fText.setIndex(pos);
363         return pos;
364     }
365     
366     
367     /**
368      * Advances the iterator either forward or backward the specified number of steps.
369      * Negative values move backward, and positive values move forward. This is
370      * equivalent to repeatedly calling next() or previous().
371      * @param n The number of steps to move. The sign indicates the direction
372      * (negative is backwards, and positive is forwards).
373      * @return The character offset of the boundary position n boundaries away from
374      * the current one.
375      * @stable ICU 2.0
376      */

377     public int next(int n) {
378         int result = current();
379         while (n > 0) {
380             result = handleNext();
381             --n;
382         }
383         while (n < 0) {
384             result = previous();
385             ++n;
386         }
387         return result;
388     }
389     
390     
391     /**
392      * Advances the iterator to the next boundary position.
393      * @return The position of the first boundary after this one.
394      * @stable ICU 2.0
395      */

396     public int next() {
397         return handleNext();
398     }
399     
400     
401     /**
402      * Moves the iterator backwards, to the last boundary preceding this one.
403      * @return The position of the last boundary position preceding this one.
404      * @stable ICU 2.0
405      */

406     public int previous() {
407         // if we're already sitting at the beginning of the text, return DONE
408
if (fText == null || current() == fText.getBeginIndex()) {
409             fLastRuleStatusIndex = 0;
410             fLastStatusIndexValid = true;
411             return BreakIterator.DONE;
412         }
413
414         if (fRData.fSRTable != null || fRData.fSFTable != null) {
415             return handlePrevious(fRData.fRTable);
416         }
417
418         // old rule syntax
419
// set things up. handlePrevious() will back us up to some valid
420
// break position before the current position (we back our internal
421
// iterator up one step to prevent handlePrevious() from returning
422
// the current position), but not necessarily the last one before
423
// where we started
424

425         int start = current();
426
427         CIPrevious32(fText);
428         int lastResult = handlePrevious(fRData.fRTable);
429         if (lastResult == BreakIterator.DONE) {
430             lastResult = fText.getBeginIndex();
431             fText.setIndex(lastResult);
432         }
433         int result = lastResult;
434         int lastTag = 0;
435         boolean breakTagValid = false;
436
437         // iterate forward from the known break position until we pass our
438
// starting point. The last break position before the starting
439
// point is our return value
440

441         for (;;) {
442             result = handleNext();
443             if (result == BreakIterator.DONE || result >= start) {
444                 break;
445             }
446             lastResult = result;
447             lastTag = fLastRuleStatusIndex;
448             breakTagValid = true;
449         }
450
451         // fLastBreakTag wants to have the value for section of text preceding
452
// the result position that we are to return (in lastResult.) If
453
// the backwards rules overshot and the above loop had to do two or more
454
// handleNext()s to move up to the desired return position, we will have a valid
455
// tag value. But, if handlePrevious() took us to exactly the correct result positon,
456
// we wont have a tag value for that position, which is only set by handleNext().
457

458         // set the current iteration position to be the last break position
459
// before where we started, and then return that value
460
fText.setIndex(lastResult);
461         fLastRuleStatusIndex = lastTag; // for use by getRuleStatus()
462
fLastStatusIndexValid = breakTagValid;
463         return lastResult;
464     }
465     /**
466      * Sets the iterator to refer to the first boundary position following
467      * the specified position.
468      * @param offset The position from which to begin searching for a break position.
469      * @return The position of the first break after the current position.
470      * @stable ICU 2.0
471      */

472     public int following(int offset) {
473         // if the offset passed in is already past the end of the text,
474
// just return DONE; if it's before the beginning, return the
475
// text's starting offset
476
fLastRuleStatusIndex = 0;
477         fLastStatusIndexValid = true;
478         if (fText == null || offset >= fText.getEndIndex()) {
479             last();
480             return next();
481         }
482         else if (offset < fText.getBeginIndex()) {
483             return first();
484         }
485
486         // otherwise, set our internal iteration position (temporarily)
487
// to the position passed in. If this is the _beginning_ position,
488
// then we can just use next() to get our return value
489

490         int result = 0;
491
492         if (fRData.fSRTable != null) {
493             // Safe Point Reverse rules exist.
494
// This allows us to use the optimum algorithm.
495
fText.setIndex(offset);
496             // move forward one codepoint to prepare for moving back to a
497
// safe point.
498
// this handles offset being between a supplementary character
499
CINext32(fText);
500             // handlePrevious will move most of the time to < 1 boundary away
501
handlePrevious(fRData.fSRTable);
502             result = next();
503             while (result <= offset) {
504                 result = next();
505             }
506             return result;
507         }
508         if (fRData.fSFTable != null) {
509             // No Safe point reverse table, but there is a safe pt forward table.
510
//
511
fText.setIndex(offset);
512             CIPrevious32(fText);
513             // handle next will give result >= offset
514
handleNext(fRData.fSFTable);
515             // previous will give result 0 or 1 boundary away from offset,
516
// most of the time
517
// we have to
518
int oldresult = previous();
519             while (oldresult > offset) {
520                 result = previous();
521                 if (result <= offset) {
522                     return oldresult;
523                 }
524                 oldresult = result;
525             }
526             result = next();
527             if (result <= offset) {
528                 return next();
529             }
530             return result;
531         }
532         // otherwise, we have to sync up first. Use handlePrevious() to back
533
// us up to a known break position before the specified position (if
534
// we can determine that the specified position is a break position,
535
// we don't back up at all). This may or may not be the last break
536
// position at or before our starting position. Advance forward
537
// from here until we've passed the starting position. The position
538
// we stop on will be the first break position after the specified one.
539
// old rule syntax
540

541         fText.setIndex(offset);
542         if (offset == fText.getBeginIndex()) {
543             return handleNext();
544         }
545         result = previous();
546
547         while (result != BreakIterator.DONE && result <= offset) {
548             result = next();
549         }
550
551         return result;
552     }
553     /**
554      * Sets the iterator to refer to the last boundary position before the
555      * specified position.
556      * @param offset The position to begin searching for a break from.
557      * @return The position of the last boundary before the starting position.
558      * @stable ICU 2.0
559      */

560     public int preceding(int offset) {
561         // if the offset passed in is already past the end of the text,
562
// just return DONE; if it's before the beginning, return the
563

564         // text's starting offset
565
if (fText == null || offset > fText.getEndIndex()) {
566             // return BreakIterator::DONE;
567
return last();
568         }
569         else if (offset < fText.getBeginIndex()) {
570             return first();
571         }
572
573         // if we start by updating the current iteration position to the
574
// position specified by the caller, we can just use previous()
575
// to carry out this operation
576

577         int result;
578         if (fRData.fSFTable != null) {
579             /// todo synwee
580
// new rule syntax
581
fText.setIndex(offset);
582             // move backwards one codepoint to prepare for moving forwards to a
583
// safe point.
584
// this handles offset being between a supplementary character
585
CIPrevious32(fText);
586             handleNext(fRData.fSFTable);
587             result = previous();
588             while (result >= offset) {
589                 result = previous();
590             }
591             return result;
592         }
593         if (fRData.fSRTable != null) {
594             // backup plan if forward safe table is not available
595
fText.setIndex(offset);
596             CINext32(fText);
597             // handle previous will give result <= offset
598
handlePrevious(fRData.fSRTable);
599
600             // next will give result 0 or 1 boundary away from offset,
601
// most of the time
602
// we have to
603
int oldresult = next();
604             while (oldresult < offset) {
605                 result = next();
606                 if (result >= offset) {
607                     return oldresult;
608                 }
609                 oldresult = result;
610             }
611             result = previous();
612             if (result >= offset) {
613                 return previous();
614             }
615             return result;
616         }
617
618         // old rule syntax
619
fText.setIndex(offset);
620         return previous();
621     }
622
623     /**
624      * Throw IllegalArgumentException unless begin <= offset < end.
625      * @stable ICU 2.0
626      */

627     protected static final void checkOffset(int offset, CharacterIterator JavaDoc text) {
628         if (offset < text.getBeginIndex() || offset > text.getEndIndex()) {
629             throw new IllegalArgumentException JavaDoc("offset out of bounds");
630         }
631     }
632
633
634 /**
635  * Returns true if the specified position is a boundary position. As a side
636  * effect, leaves the iterator pointing to the first boundary position at
637  * or after "offset".
638  * @param offset the offset to check.
639  * @return True if "offset" is a boundary position.
640  * @stable ICU 2.0
641  */

642 public boolean isBoundary(int offset) {
643     checkOffset(offset, fText);
644     
645     // the beginning index of the iterator is always a boundary position by definition
646
if (offset == fText.getBeginIndex()) {
647         first(); // For side effects on current position, tag values.
648
return true;
649     }
650
651     if (offset == fText.getEndIndex()) {
652         last(); // For side effects on current position, tag values.
653
return true;
654     }
655
656     // otherwise, we can use following() on the position before the specified
657
// one and return true if the position we get back is the one the user
658
// specified
659

660     // return following(offset - 1) == offset;
661
// TODO: check whether it is safe to revert to the simpler offset-1 code
662
// The safe rules may take care of unpaired surrogates ok.
663
fText.setIndex(offset);
664     CIPrevious32(fText);
665     int pos = fText.getIndex();
666     boolean result = following(pos) == offset;
667     return result;
668 }
669
670 /**
671  * Returns the current iteration position.
672  * @return The current iteration position.
673  * @stable ICU 2.0
674  */

675 public int current() {
676     return (fText != null) ? fText.getIndex() : BreakIterator.DONE;
677     }
678
679
680
681 private void makeRuleStatusValid() {
682     if (fLastStatusIndexValid == false) {
683         // No cached status is available.
684
if (fText == null || current() == fText.getBeginIndex()) {
685             // At start of text, or there is no text. Status is always zero.
686
fLastRuleStatusIndex = 0;
687             fLastStatusIndexValid = true;
688         } else {
689             // Not at start of text. Find status the tedious way.
690
int pa = current();
691             previous();
692             int pb = next();
693             Assert.assrt (pa == pb);
694         }
695         Assert.assrt(fLastStatusIndexValid == true);
696         Assert.assrt(fLastRuleStatusIndex >= 0 && fLastRuleStatusIndex < fRData.fStatusTable.length);
697     }
698 }
699
700
701 /**
702  * Return the status tag from the break rule that determined the most recently
703  * returned break position. The values appear in the rule source
704  * within brackets, {123}, for example. For rules that do not specify a
705  * status, a default value of 0 is returned. If more than one rule applies,
706  * the numerically largest of the possible status values is returned.
707  * <p>
708  * Of the standard types of ICU break iterators, only the word break
709  * iterator provides status values. The values are defined in
710  * class RuleBasedBreakIterator, and allow distinguishing between words
711  * that contain alphabetic letters, "words" that appear to be numbers,
712  * punctuation and spaces, words containing ideographic characters, and
713  * more. Call <code>getRuleStatus</code> after obtaining a boundary
714  * position from <code>next()</code>, <code>previous()</code>, or
715  * any other break iterator functions that returns a boundary position.
716  * <p>
717  * @return the status from the break rule that determined the most recently
718  * returned break position.
719  *
720  * @draft ICU 3.0
721  * @provisional This is a draft API and might change in a future release of ICU.
722  */

723
724 public int getRuleStatus() {
725     makeRuleStatusValid();
726     // Status records have this form:
727
// Count N <-- fLastRuleStatusIndex points here.
728
// Status val 0
729
// Status val 1
730
// ...
731
// Status val N-1 <-- the value we need to return
732
// The status values are sorted in ascending order.
733
// This function returns the last (largest) of the array of status values.
734
int idx = fLastRuleStatusIndex + fRData.fStatusTable[fLastRuleStatusIndex];
735     int tagVal = fRData.fStatusTable[idx];
736
737     return tagVal;
738 }
739
740
741
742 /**
743  * Get the status (tag) values from the break rule(s) that determined the most
744  * recently returned break position. The values appear in the rule source
745  * within brackets, {123}, for example. The default status value for rules
746  * that do not explicitly provide one is zero.
747  * <p>
748  * The status values used by the standard ICU break rules are defined
749  * as public constants in class RuleBasedBreakIterator.
750  * <p>
751  * If the size of the output array is insufficient to hold the data,
752  * the output will be truncated to the available length. No exception
753  * will be thrown.
754  *
755  * @param fillInArray an array to be filled in with the status values.
756  * @return The number of rule status values from rules that determined
757  * the most recent boundary returned by the break iterator.
758  * In the event that the array is too small, the return value
759  * is the total number of status values that were available,
760  * not the reduced number that were actually returned.
761  * @draft ICU 3.0
762  * @provisional This is a draft API and might change in a future release of ICU.
763  */

764 public int getRuleStatusVec(int[] fillInArray) {
765     makeRuleStatusValid();
766     int numStatusVals = fRData.fStatusTable[fLastRuleStatusIndex];
767     if (fillInArray != null) {
768         int numToCopy = Math.min(numStatusVals, fillInArray.length);
769         for (int i=0; i<numToCopy; i++) {
770             fillInArray[i] = fRData.fStatusTable[fLastRuleStatusIndex + i + 1];
771         }
772     }
773     return numStatusVals;
774  }
775
776
777 /**
778  * Return a CharacterIterator over the text being analyzed. This version
779  * of this method returns the actual CharacterIterator we're using internally.
780  * Changing the state of this iterator can have undefined consequences. If
781  * you need to change it, clone it first.
782  * @return An iterator over the text being analyzed.
783  * @stable ICU 2.0
784  */

785     public CharacterIterator JavaDoc getText() {
786         return fText;
787     }
788
789
790     /**
791      * Set the iterator to analyze a new piece of text. This function resets
792      * the current iteration position to the beginning of the text.
793      * @param newText An iterator over the text to analyze.
794      * @stable ICU 2.0
795      */

796     public void setText(CharacterIterator JavaDoc newText) {
797         fText = newText;
798         this.first();
799     }
800     
801     /**
802      * Control debug, trace and dump options.
803      * @internal
804      * @deprecated This API is ICU internal only.
805      */

806     protected static String JavaDoc fDebugEnv = System.getProperty("U_RBBIDEBUG");
807
808     
809     // 32 bit Char value returned from when an iterator has run out of range.
810
// Positive value so fast case (not end, not surrogate) can be checked
811
// with a single test.
812
private static int CI_DONE32 = 0x7fffffff;
813     
814     /**
815      * Move the iterator forward to the next code point, and return that code point,
816      * leaving the iterator positioned at char returned.
817      * For Supplementary chars, the iterator is left positioned at the lead surrogate.
818      * @param ci The character iterator
819      * @return The next code point.
820      */

821      static int CINext32(CharacterIterator JavaDoc ci) {
822         // If the current position is at a surrogate pair, move to the trail surrogate
823
// which leaves it in positon for underlying iterator's next() to work.
824
int c= ci.current();
825         if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE && c<=UTF16.LEAD_SURROGATE_MAX_VALUE) {
826             c = ci.next();
827             if (c<UTF16.TRAIL_SURROGATE_MIN_VALUE || c>UTF16.TRAIL_SURROGATE_MAX_VALUE) {
828                c = ci.previous();
829             }
830         }
831
832         // For BMP chars, this next() is the real deal.
833
c = ci.next();
834         
835         // If we might have a lead surrogate, we need to peak ahead to get the trail
836
// even though we don't want to really be positioned there.
837
if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE) {
838             c = CINextTrail32(ci, c);
839         }
840         
841         if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && c != CI_DONE32) {
842             // We got a supplementary char. Back the iterator up to the postion
843
// of the lead surrogate.
844
ci.previous();
845         }
846         return c;
847    }
848
849     
850     // Out-of-line portion of the in-line Next32 code.
851
// The call site does an initial ci.next() and calls this function
852
// if the 16 bit value it gets is >= LEAD_SURROGATE_MIN_VALUE.
853
// NOTE: we leave the underlying char iterator positioned in the
854
// middle of a surroage pair. ci.next() will work correctly
855
// from there, but the ci.getIndex() will be wrong, and needs
856
// adjustment.
857
private static int CINextTrail32(CharacterIterator JavaDoc ci, int lead) {
858         int retVal = lead;
859         if (lead <= UTF16.LEAD_SURROGATE_MAX_VALUE) {
860             char cTrail = ci.next();
861             if (UTF16.isTrailSurrogate(cTrail)) {
862                 retVal = ((lead - UTF16.LEAD_SURROGATE_MIN_VALUE) << 10) +
863                             (cTrail - UTF16.TRAIL_SURROGATE_MIN_VALUE) +
864                             UTF16.SUPPLEMENTARY_MIN_VALUE;
865             } else {
866                 ci.previous();
867             }
868         } else {
869             if (lead == CharacterIterator.DONE && ci.getIndex() >= ci.getEndIndex()) {
870                 retVal = CI_DONE32;
871             }
872         }
873         return retVal;
874     }
875        
876     private static int CIPrevious32(CharacterIterator JavaDoc ci) {
877         if (ci.getIndex() <= ci.getBeginIndex()) {
878             return CI_DONE32;
879         }
880         char trail = ci.previous();
881         int retVal = trail;
882         if (UTF16.isTrailSurrogate(trail) && ci.getIndex()>ci.getBeginIndex()) {
883             char lead = ci.previous();
884             if (UTF16.isLeadSurrogate(lead)) {
885                 retVal = (((int)lead - UTF16.LEAD_SURROGATE_MIN_VALUE) << 10) +
886                           ((int)trail - UTF16.TRAIL_SURROGATE_MIN_VALUE) +
887                           UTF16.SUPPLEMENTARY_MIN_VALUE;
888             } else {
889                 ci.next();
890             }
891         }
892         return retVal;
893     }
894    
895     static int CICurrent32(CharacterIterator JavaDoc ci) {
896         char lead = ci.current();
897         int retVal = lead;
898         if (retVal < UTF16.LEAD_SURROGATE_MIN_VALUE) {
899             return retVal;
900         }
901         if (UTF16.isLeadSurrogate(lead)) {
902             int trail = (int)ci.next();
903             ci.previous();
904             if (UTF16.isTrailSurrogate((char)trail)) {
905                 retVal = ((lead - UTF16.LEAD_SURROGATE_MIN_VALUE) << 10) +
906                          (trail - UTF16.TRAIL_SURROGATE_MIN_VALUE) +
907                          UTF16.SUPPLEMENTARY_MIN_VALUE;
908             }
909          } else {
910             if (lead == CharacterIterator.DONE) {
911                 if (ci.getIndex() >= ci.getEndIndex()) {
912                     retVal = CI_DONE32;
913                 }
914             }
915          }
916         return retVal;
917     }
918     
919
920     //-----------------------------------------------------------------------------------
921
//
922
// handleNext(void) All forward iteration vectors through this function.
923
// NOTE: This function is overridden by the dictionary base break iterator.
924
// User level API functions go to the dbbi implementation
925
// when the break iterator type is dbbi.
926
// The DBBI implementation sometimes explicitly calls back to here,
927
// its inherited handleNext().
928
//
929
//-----------------------------------------------------------------------------------
930
int handleNext() {
931         return handleNext(fRData.fFTable);
932     }
933
934     /**
935      * The State Machine Engine for moving forward is here.
936      * This function is the heart of the RBBI run time engine.
937      *
938      * @param stateTable
939      * @return the new iterator position
940      *
941      * A note on supplementary characters and the position of underlying
942      * Java CharacterIterator: Normally, a character iterator is positioned at
943      * the char most recently returned by next(). Within this function, when
944      * a supplementary char is being processed, the char iterator is left
945      * sitting on the trail surrogate, in the middle of the code point.
946      * This is different from everywhere else, where an iterator always
947      * points at the lead surrogate of a supplementary.
948      */

949     private int handleNext(short stateTable[]) {
950         int state;
951         short category = 0;
952         int mode;
953         int row;
954         int c;
955         int lookaheadStatus = 0;
956         int lookaheadTagIdx = 0;
957         int result = 0;
958         int initialPosition = 0;
959         int lookaheadResult = 0;
960         boolean lookAheadHardBreak =
961             (stateTable[RBBIDataWrapper.FLAGS+1] & RBBIDataWrapper.RBBI_LOOKAHEAD_HARD_BREAK) != 0;
962         
963         if (fTrace) {
964             System.out.println("Handle Next pos char state category");
965         }
966
967         // No matter what, handleNext alway correctly sets the break tag value.
968
fLastStatusIndexValid = true;
969         fLastRuleStatusIndex = 0;
970
971         // if we're already at the end of the text, return DONE.
972
if (fText == null) {
973             fLastRuleStatusIndex = 0;
974             return BreakIterator.DONE;
975         }
976
977         // Set up the starting char
978
initialPosition = fText.getIndex();
979         result = initialPosition;
980         c = fText.current();
981         if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE) {
982             c = CINextTrail32(fText, c);
983             if (c == CI_DONE32) {
984                 fLastRuleStatusIndex = 0;
985                 return BreakIterator.DONE;
986             }
987         }
988
989         // Set the initial state for the state machine
990
state = START_STATE;
991         row = fRData.getRowIndex(state);
992         category = 3;
993         mode = RBBI_RUN;
994         if ((stateTable[RBBIDataWrapper.FLAGS+1] & RBBIDataWrapper.RBBI_BOF_REQUIRED) != 0) {
995             category = 2;
996             mode = RBBI_START;
997         }
998
999
1000        // loop until we reach the end of the text or transition to state 0
1001
while (state != STOP_STATE) {
1002            if (c == CI_DONE32) {
1003                // Reached end of input string.
1004
if (mode == RBBI_END) {
1005                    // We have already run the loop one last time with the
1006
// character set to the pseudo {eof} value. Now it is time
1007
// to unconditionally bail out.
1008

1009                    if (lookaheadResult > result) {
1010                        // We ran off the end of the string with a pending
1011
// look-ahead match.
1012
// Treat this as if the look-ahead condition had been
1013
// met, and return
1014
// the match at the / position from the look-ahead rule.
1015
result = lookaheadResult;
1016                        fLastRuleStatusIndex = lookaheadTagIdx;
1017                        lookaheadStatus = 0;
1018                    } else if (result == initialPosition) {
1019                        // Ran off end, no match found.
1020
// move forward one
1021
fText.setIndex(initialPosition);
1022                        CINext32(fText);
1023                    }
1024                    break;
1025                }
1026                // Run the loop one last time with the fake end-of-input character category
1027
mode = RBBI_END;
1028                category = 1;
1029            }
1030            
1031            // Get the char category. An incoming category of 1 or 2 mens that
1032
// we are preset for doing the beginning or end of input, and
1033
// that we shouldn't get a category from an actual text input character.
1034
//
1035
if (mode == RBBI_RUN) {
1036                // look up the current character's character category, which tells us
1037
// which column in the state table to look at.
1038
//
1039
category = (short) fRData.fTrie.getCodePointValue(c);
1040                
1041                // Check the dictionary bit in the character's category.
1042
// Counter is only used by dictionary based iterators (subclasses).
1043
// Chars that need to be handled by a dictionary have a flag bit set
1044
// in their category values.
1045
//
1046
if ((category & 0x4000) != 0) {
1047                    fDictionaryCharCount++;
1048                    // And off the dictionary flag bit.
1049
category &= ~0x4000;
1050                }
1051           }
1052
1053            if (fTrace) {
1054                System.out.print(" " + RBBIDataWrapper.intToString(fText.getIndex(), 5));
1055                System.out.print(RBBIDataWrapper.intToHexString(c, 10));
1056                System.out.println(RBBIDataWrapper.intToString(state,7) + RBBIDataWrapper.intToString(category,6));
1057            }
1058
1059            // look up a state transition in the state table
1060
// state = row->fNextState[category];
1061
state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category];
1062            row = fRData.getRowIndex(state);
1063
1064            // Advance to the next character.
1065
// If this is a beginning-of-input loop iteration, don't advance.
1066
// The next iteration will be processing the first real input character.
1067
if (mode == RBBI_RUN) {
1068                c = (int)fText.next();
1069                if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE) {
1070                    c = CINextTrail32(fText, c);
1071                }
1072            } else {
1073                if (mode == RBBI_START) {
1074                    mode = RBBI_RUN;
1075                }
1076            }
1077             
1078            if (stateTable[row + RBBIDataWrapper.ACCEPTING] == -1) {
1079                // Match found, common case
1080
result = fText.getIndex();
1081                if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && c != CI_DONE32) {
1082                    // The iterator has been left in the middle of a surrogate pair.
1083
// We want the start of it.
1084
result--;
1085                }
1086
1087                // Remember the break status (tag) values.
1088
fLastRuleStatusIndex = stateTable[row + RBBIDataWrapper.TAGIDX];
1089            }
1090
1091            if (stateTable[row + RBBIDataWrapper.LOOKAHEAD] != 0) {
1092                if (lookaheadStatus != 0
1093                    && stateTable[row + RBBIDataWrapper.ACCEPTING] == lookaheadStatus) {
1094                    // Lookahead match is completed. Set the result accordingly, but only
1095
// if no other rule has matched further in the mean time.
1096
result = lookaheadResult;
1097                    fLastRuleStatusIndex = lookaheadTagIdx;
1098                    lookaheadStatus = 0;
1099                    // TODO: make a standalone hard break in a rule work.
1100
if (lookAheadHardBreak) {
1101                        return result;
1102                    }
1103                    // Look-ahead completed, but other rules may match further. Continue on.
1104
// TODO: junk this feature? I don't think it's used anywhere.
1105
continue;
1106                }
1107
1108                lookaheadResult = fText.getIndex();
1109                if (c>=UTF16.SUPPLEMENTARY_MIN_VALUE && c!=CI_DONE32) {
1110                    // The iterator has been left in the middle of a surrogate pair.
1111
// We want the beginning of it.
1112
lookaheadResult--;
1113                }
1114                lookaheadStatus = stateTable[row + RBBIDataWrapper.LOOKAHEAD];
1115                lookaheadTagIdx = stateTable[row + RBBIDataWrapper.TAGIDX];
1116                continue;
1117            }
1118
1119
1120            if (stateTable[row + RBBIDataWrapper.ACCEPTING] != 0) {
1121                // Because this is an accepting state, any in-progress look-ahead match
1122
// is no longer relavant. Clear out the pending lookahead status.
1123
lookaheadStatus = 0;
1124            }
1125            
1126         } // End of state machine main loop
1127

1128        // The state machine is done. Check whether it found a match...
1129

1130        // If the iterator failed to advance in the match engine, force it ahead by one.
1131
// (This really indicates a defect in the break rules. They should always match
1132
// at least one character.)
1133
if (result == initialPosition) {
1134            result = fText.setIndex(initialPosition);
1135            CINext32(fText);
1136            result = fText.getIndex();
1137        }
1138
1139        // Leave the iterator at our result position.
1140
// (we may have advanced beyond the last accepting position chasing after
1141
// longer matches that never completed.)
1142
fText.setIndex(result);
1143        if (fTrace) {
1144            System.out.println("result = " + result);
1145        }
1146        return result;
1147    }
1148
1149    
1150    
1151    private int handlePrevious(short stateTable[]) {
1152        int state;
1153        int category = 0;
1154        int mode;
1155        int row;
1156        int c;
1157        int lookaheadStatus = 0;
1158        int result = 0;
1159        int initialPosition = 0;
1160        int lookaheadResult = 0;
1161        boolean lookAheadHardBreak =
1162            (stateTable[RBBIDataWrapper.FLAGS+1] & RBBIDataWrapper.RBBI_LOOKAHEAD_HARD_BREAK) != 0;
1163        
1164        
1165        if (fText == null || stateTable == null) {
1166            return 0;
1167        }
1168        // handlePrevious() never gets the rule status.
1169
// Flag the status as invalid; if the user ever asks for status, we will need
1170
// to back up, then re-find the break position using handleNext(), which does
1171
// get the status value.
1172
fLastStatusIndexValid = false;
1173        fLastRuleStatusIndex = 0;
1174        
1175        // set up the starting char
1176
initialPosition = fText.getIndex();
1177        result = initialPosition;
1178        c = CIPrevious32(fText);
1179        
1180        // Set up the initial state for the state machine
1181
state = START_STATE;
1182        row = fRData.getRowIndex(state);
1183        category = 3; // TODO: obsolete? from the old start/run mode scheme?
1184
mode = RBBI_RUN;
1185        if ((stateTable[RBBIDataWrapper.FLAGS+1] & RBBIDataWrapper.RBBI_BOF_REQUIRED) != 0) {
1186            category = 2;
1187            mode = RBBI_START;
1188        }
1189        
1190        if (fTrace) {
1191            System.out.println("Handle Prev pos char state category ");
1192        }
1193        
1194        // loop until we reach the beginning of the text or transition to state 0
1195
//
1196
mainLoop: for (;;) {
1197            innerBlock: {
1198                if (c == CI_DONE32) {
1199                    // Reached end of input string.
1200
if (mode == RBBI_END || fRData.fHeader.fVersion == 1) {
1201                        // Either this is the old (ICU 3.2 and earlier) format data which
1202
// does not support explicit support for matching {eof}, or
1203
// we have already done the {eof} iteration. Now is the time
1204
// to unconditionally bail out.
1205
if (lookaheadResult < result) {
1206                            // We ran off the end of the string with a pending look-ahead match.
1207
// Treat this as if the look-ahead condition had been met, and return
1208
// the match at the / position from the look-ahead rule.
1209
result = lookaheadResult;
1210                            lookaheadStatus = 0;
1211                        } else if (result == initialPosition) {
1212                            // Ran off start, no match found.
1213
// Move one position (towards the start, since we are doing previous.)
1214
fText.setIndex(initialPosition);
1215                            CIPrevious32(fText);
1216                        }
1217                        break mainLoop;
1218                    }
1219                    mode = RBBI_END;
1220                    category = 1;
1221                }
1222                
1223                if (mode == RBBI_RUN) {
1224                    // look up the current character's category, which tells us
1225
// which column in the state table to look at.
1226
//
1227
category = (short) fRData.fTrie.getCodePointValue(c);
1228                    
1229                    // Check the dictionary bit in the character's category.
1230
// Counter is only used by dictionary based iterators (subclasses).
1231
// Chars that need to be handled by a dictionary have a flag bit set
1232
// in their category values.
1233
//
1234
if ((category & 0x4000) != 0) {
1235                        fDictionaryCharCount++;
1236                        // And off the dictionary flag bit.
1237
category &= ~0x4000;
1238                    }
1239                }
1240                
1241                
1242                if (fTrace) {
1243                    System.out.print(" " + fText.getIndex() + " ");
1244                    if (0x20 <= c && c < 0x7f) {
1245                        System.out.print(" " + c + " ");
1246                    } else {
1247                        System.out.print(" " + Integer.toHexString(c) + " ");
1248                    }
1249                    System.out.println(" " + state + " " + category + " ");
1250                }
1251                
1252                // State Transition - move machine to its next state
1253
//
1254
state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category];
1255                row = fRData.getRowIndex(state);
1256                
1257                if (stateTable[row + RBBIDataWrapper.ACCEPTING] == -1) {
1258                    // Match found, common case, could have lookahead so we move
1259
// on to check it
1260
result = fText.getIndex();
1261                }
1262                
1263                if (stateTable[row + RBBIDataWrapper.LOOKAHEAD] != 0) {
1264                    if (lookaheadStatus != 0
1265                            && stateTable[row + RBBIDataWrapper.ACCEPTING] == lookaheadStatus) {
1266                        // Lookahead match is completed. Set the result
1267
// accordingly, but only
1268
// if no other rule has matched further in the mean
1269
// time.
1270
result = lookaheadResult;
1271                        lookaheadStatus = 0;
1272                        // TODO: make a standalone hard break in a rule work.
1273

1274                        if (lookAheadHardBreak) {
1275                            break mainLoop;
1276                        }
1277                        // Look-ahead completed, but other rules may match further.
1278
// Continue on.
1279
// TODO: junk this feature? I don't think that it's used anywhere.
1280
break innerBlock;
1281                    }
1282                    // Hit a possible look-ahead match. We are at the
1283
// position of the '/'. Remember this position.
1284
lookaheadResult = fText.getIndex();
1285                    lookaheadStatus = stateTable[row + RBBIDataWrapper.LOOKAHEAD];
1286                    break innerBlock;
1287                }
1288                
1289                // not lookahead...
1290
if (stateTable[row + RBBIDataWrapper.ACCEPTING] != 0) {
1291                    // This is a plain (non-look-ahead) accepting state.
1292
if (!lookAheadHardBreak) {
1293                        // Clear out any pending look-ahead matches,
1294
// but only if not doing the lookAheadHardBreak option
1295
// which needs to force a break no matter what is going
1296
// on with the rest of the match, i.e. we can't abandon
1297
// a partially completed look-ahead match because
1298
// some other rule matched further than the '/' position
1299
// in the look-ahead match.
1300
lookaheadStatus = 0;
1301                    }
1302                }
1303                
1304            } // end of innerBlock. "break innerBlock" in above code comes out here.
1305

1306        
1307            if (state == STOP_STATE) {
1308                // Normal loop exit is here
1309
break mainLoop;
1310            }
1311        
1312            // then move iterator position backwards one character
1313
//
1314
if (mode == RBBI_RUN) {
1315                c = CIPrevious32(fText);
1316            } else {
1317                if (mode == RBBI_START) {
1318                    mode = RBBI_RUN;
1319                }
1320            }
1321        
1322        
1323        } // End of the main loop.
1324

1325        // The state machine is done. Check whether it found a match...
1326
//
1327
// If the iterator failed to advance in the match engine, force it ahead by one.
1328
// (This really indicates a defect in the break rules. They should always match
1329
// at least one character.)
1330
if (result == initialPosition) {
1331            result = fText.setIndex(initialPosition);
1332            CIPrevious32(fText);
1333            result = fText.getIndex();
1334        }
1335        
1336        fText.setIndex(result);
1337        if (fTrace) {
1338            System.out.println("Result = " + result);
1339        }
1340        
1341        return result;
1342    }
1343
1344
1345
1346
1347
1348    //-------------------------------------------------------------------------------
1349

1350    //
1351

1352    // isDictionaryChar Return true if the category lookup for this char
1353

1354    // indicates that it is in the set of dictionary lookup
1355

1356    // chars.
1357

1358    //
1359

1360    // This function is intended for use by dictionary based
1361

1362    // break iterators.
1363

1364    //
1365

1366    //-------------------------------------------------------------------------------
1367

1368    boolean isDictionaryChar(int c) {
1369    
1370        short category = (short) fRData.fTrie.getCodePointValue(c);
1371    
1372        return (category & 0x4000) != 0;
1373    
1374    }
1375
1376}
1377//eof
1378
Popular Tags