RuleBasedBreakIterator


1   //##header 1189099963000 FOUNDATION
2   /*
3    *******************************************************************************
4    * Copyright (C) 2005-2006 International Business Machines Corporation and          *
5    * others. All Rights Reserved.                                                *
6    *******************************************************************************
7    */
8   package com.ibm.icu.text;
9   
10  import java.text.CharacterIterator  ;
11  import java.io.IOException  ;
12  import java.io.InputStream  ;
13  import java.io.OutputStream  ;
14  import java.io.ByteArrayInputStream  ;
15  import java.io.ByteArrayOutputStream  ;
16  
17  import com.ibm.icu.impl.Assert;
18  
19  
20  /**
21   * Rule Based Break Iterator 
22   * This is a port of the C++ class RuleBasedBreakIterator from ICU4C.
23   * 
24   * @stable ICU 2.0
25   */
26  public class RuleBasedBreakIterator extends BreakIterator {
27  
28      
29      //=======================================================================
30      // Constructors & Factories
31      //=======================================================================
32      
33      /** Creates an iterator without assigning any rule data: the body is empty, so fRData is left unset.
34       * @internal 
35       * @deprecated This API is ICU internal only.
36       */
37      public RuleBasedBreakIterator() {
38      }
39  
40      /**
41       * Create a break iterator from a precompiled set of rules.
42       * @internal
43       * @deprecated This API is ICU internal only.
44       */
45      public static RuleBasedBreakIterator getInstanceFromCompiledRules(InputStream   is) throws IOException   {
46          RuleBasedBreakIterator  This = new RuleBasedBreakIterator();
47          This.fRData = RBBIDataWrapper.get(is);
48          return This;   
49      }
50      
/**
 * Copy constructor used by {@link #clone()}.  The rule data is shared with
 * {@code other}; the text iterator is cloned so the two break iterators
 * can be repositioned independently.
 *
 * @param other the iterator to copy; its text may be null, in which case
 *              the copy has no text either.
 */
private RuleBasedBreakIterator(RuleBasedBreakIterator other) {
    // TODO: check types.
    fRData = other.fRData;
    // Test the SOURCE iterator's text: this.fText is never null here (it has a
    // field initializer), so testing it would dereference a null other.fText.
    if (other.fText != null) {
        fText = (CharacterIterator) other.fText.clone();
    } else {
        fText = null;
    }
}
58  
/**
 * Constructs a RuleBasedBreakIterator from a set of rules supplied as a string.
 * The rules are compiled into an in-memory byte array, which is then loaded
 * through {@code RBBIDataWrapper.get()}.
 *
 * @param rules The break rules to be used.
 * @throws RuntimeException if an IOException is raised while compiling the
 *         rules or reading the compiled form back.
 * @stable ICU 2.2
 */
public RuleBasedBreakIterator(String rules) {
    init();
    try {
        ByteArrayOutputStream compiled = new ByteArrayOutputStream();
        compileRules(rules, compiled);
        fRData = RBBIDataWrapper.get(new ByteArrayInputStream(compiled.toByteArray()));
    } catch (IOException e) {
        // An IO exception can only arrive here if there is a bug in the RBBI Rule compiler,
        //  causing bogus compiled rules to be produced, but with no compile error raised.
//#ifdef FOUNDATION
        RuntimeException rte = new RuntimeException("RuleBasedBreakIterator rule compilation internal error:");
//#else
//##            RuntimeException rte = new RuntimeException("RuleBasedBreakIterator rule compilation internal error:", e);
//#endif
        throw rte;
    }
}
86      
87      
88      //=======================================================================
89      // Boilerplate
90      //=======================================================================
91      
92      /**
93       * Clones this iterator.
94       * @return A newly-constructed RuleBasedBreakIterator with the same
95       * behavior as this one.
96       * @stable ICU 2.0
97       */
98      public Object   clone()
99      {
100         RuleBasedBreakIterator result = new RuleBasedBreakIterator(this);
101         return result;
102     }
103 
104     /**
105      * Returns true if both BreakIterators are of the same class, have the same
106      * rules, and iterate over the same text.
107      * @stable ICU 2.0
108      */
109     public boolean equals(Object   that) {
110         try {
111             RuleBasedBreakIterator other = (RuleBasedBreakIterator) that;
112             if (fRData != other.fRData && (fRData == null || other.fRData == null)) {
113                 return false;   
114             }
115             if (fRData != null && other.fRData != null && 
116                     (!fRData.fRuleSource.equals(other.fRData.fRuleSource))) {
117                 return false;
118             }
119             if (fText == null && other.fText == null) {
120                 return true;   
121             }
122             if (fText == null || other.fText == null) {
123                 return false;   
124             }
125             return fText.equals(other.fText);
126         }
127         catch(ClassCastException   e) {
128             return false;
129         }
130      }
131 
132     /**
133      * Returns the description (rules) used to create this iterator.
134      * (In ICU4C, the same function is RuleBasedBreakIterator::getRules())
135      * @stable ICU 2.0
136      */
137     public String   toString() {
138         String     retStr = null;
139         if (fRData != null) {
140             retStr =  fRData.fRuleSource;
141         }
142         return retStr;
143     }
144 
145     /**
146      * Compute a hashcode for this BreakIterator
147      * @return A hash code
148      * @stable ICU 2.0
149      */
150     public int hashCode()
151     {
152         return fRData.fRuleSource.hashCode(); 
153     }
154 
155     
156     /** 
157      * Tag value for "words" that do not fit into any of the other categories. 
158      * Includes spaces and most punctuation. 
159      * @draft ICU 3.0 
160      * @provisional This is a draft API and might change in a future release of ICU.
161      */
162     public static final int WORD_NONE           = 0;
163 
164     /**
165      * Upper bound for tags for uncategorized words (same value as WORD_NUMBER, where the next range starts). 
166      * @draft ICU 3.0 
167      * @provisional This is a draft API and might change in a future release of ICU.
168      */
169     public static final int WORD_NONE_LIMIT     = 100;
170 
171     /**
172      * Tag value for words that appear to be numbers, lower limit. 
173      * @draft ICU 3.0 
174      * @provisional This is a draft API and might change in a future release of ICU.
175      */
176     public static final int WORD_NUMBER         = 100;
177 
178     /** 
179      * Tag value for words that appear to be numbers, upper limit (same value as WORD_LETTER).
180      * @draft ICU 3.0 
181      * @provisional This is a draft API and might change in a future release of ICU.
182      */
183     public static final int WORD_NUMBER_LIMIT   = 200;
184 
185     /** 
186      * Tag value for words that contain letters, excluding
187      * hiragana, katakana or ideographic characters, lower limit. 
188      * @draft ICU 3.0 
189      * @provisional This is a draft API and might change in a future release of ICU.
190      */
191     public static final int WORD_LETTER         = 200;
192 
193     /** 
194      * Tag value for words containing letters, upper limit (same value as WORD_KANA).
195      * @draft ICU 3.0 
196      * @provisional This is a draft API and might change in a future release of ICU.
197      */
198     public static final int WORD_LETTER_LIMIT   = 300;
199 
200     /** 
201      * Tag value for words containing kana characters, lower limit.
202      * @draft ICU 3.0 
203      * @provisional This is a draft API and might change in a future release of ICU.
204      */
205     public static final int WORD_KANA           = 300;
206 
207     /** 
208      * Tag value for words containing kana characters, upper limit (same value as WORD_IDEO).
209      * @draft ICU 3.0 
210      * @provisional This is a draft API and might change in a future release of ICU.
211      */
212     public static final int WORD_KANA_LIMIT     = 400;
213 
214     /**
215      * Tag value for words containing ideographic characters, lower limit.
216      * @draft ICU 3.0 
217      * @provisional This is a draft API and might change in a future release of ICU.
218      */
219     public static final int WORD_IDEO           = 400;
220 
221     /**
222      * Tag value for words containing ideographic characters, upper limit.
223      * @draft ICU 3.0 
224      * @provisional This is a draft API and might change in a future release of ICU.
225      */
226     public static final int WORD_IDEO_LIMIT     = 500;
227 
228    
229     
230     
231     private static final int  START_STATE = 1;     // The state number of the starting state
232     private static final int  STOP_STATE  = 0;     // The state-transition value indicating "stop"
233     
234     // RBBIRunMode - the state machine runs an extra iteration at the beginning and at the end
235     //               of user text.  An int holding one of the three constants below keeps track
236     //               of where we are.  The state machine only fetches user text input while in RUN mode.
237     private static final int  RBBI_START  = 0;
238     private static final int  RBBI_RUN    = 1;
239     private static final int  RBBI_END   = 2;
240 
241     /**
242      * The character iterator through which this BreakIterator accesses the text.
243      * Starts as an iterator over the empty string; setText() stores whatever it is given.
244      * @internal
245      * @deprecated This API is ICU internal only.
246      */
247     private CharacterIterator     fText = new java.text.StringCharacterIterator  ("");
248     
249     /**
250      * The rule data for this BreakIterator instance; not assigned by the no-argument constructor.
251      * @internal
252      * @deprecated This API is ICU internal only.
253      */
254     protected RBBIDataWrapper     fRData;
255     
256     /** Index into fRData.fStatusTable of the Rule {tag} values for the most recent match. 
257      *  @internal
258      * @deprecated This API is ICU internal only.
259      */
260     private int                 fLastRuleStatusIndex;
261 
262     /**
263      * Rule tag value valid flag: true when fLastRuleStatusIndex can be used as-is.
264      * Some iterator operations don't intrinsically set the correct tag value.
265      * This flag lets us lazily compute the value if we are ever asked for it.
266      * @internal
267      * @deprecated This API is ICU internal only.
268      */
269     private boolean             fLastStatusIndexValid;
270     
271     /**
272      * Counter for the number of characters encountered with the "dictionary"
273      *   flag set.  Normal RBBI iterators don't use it, although the code
274      *   for updating it is live.  Dictionary Based break iterators (a subclass
275      *   of us) access this field directly.
276      * @internal
277      * @deprecated This API is ICU internal only.
278      */
279      protected int fDictionaryCharCount;
280     
281     /**
282      * Debugging flag.  Trace operation of state machine when true; init() sets it when the U_RBBIDEBUG system property contains "trace".
283      * @internal
284      * @deprecated This API is ICU internal only.
285      */
286     public static boolean       fTrace;
287     
288     
289     /**
290      * Dump the contents of the state table and character classes for this break iterator.
291      * For debugging only.
292      * @internal
293      * @deprecated This API is ICU internal only.
294      */
295     public void dump() {
296         this.fRData.dump();   
297     }
298 
299     private static boolean debugInitDone = false;
300     
301     private void init() {
302         fLastStatusIndexValid = true;
303         fDictionaryCharCount  = 0;
304 
305  
306         if (debugInitDone == false) {
307             String   debugEnv = System.getProperty("U_RBBIDEBUG");
308             if (debugEnv!=null && debugEnv.indexOf("trace")>=0) {
309                 fTrace = true;
310             }
311             debugInitDone = true;
312         }
313     }
314 
315     private static void compileRules(String   rules, OutputStream   ruleBinary) throws IOException   {
316         RBBIRuleBuilder.compileRules(rules, ruleBinary);
317     }
318     
319     //=======================================================================
320     // BreakIterator overrides
321     //=======================================================================
322 
323     /**
324      * Sets the current iteration position to the beginning of the text.
325      * (i.e., the CharacterIterator's starting offset).
326      * @return The offset of the beginning of the text.
327      * @stable ICU 2.0
328      */
329     public int first() {
330         fLastRuleStatusIndex  = 0;
331         fLastStatusIndexValid = true;
332         if (fText == null) {
333             return BreakIterator.DONE;
334         }
335         fText.first();
336         return fText.getIndex();
337     }
338     
339     
340     /**
341      * Sets the current iteration position to the end of the text.
342      * (i.e., the CharacterIterator's ending offset).
343      * @return The text's past-the-end offset.
344      * @stable ICU 2.0
345      */
346     public int last() {
347         if (fText == null) {
348             fLastRuleStatusIndex  = 0;
349             fLastStatusIndexValid = true;
350             return BreakIterator.DONE;
351         }
352 
353         // I'm not sure why, but t.last() returns the offset of the last character,
354         // rather than the past-the-end offset
355         //
356         //   (It's so a loop like for(p=it.last(); p!=DONE; p=it.previous()) ...
357         //     will work correctly.)
358 
359 
360         fLastStatusIndexValid = false;
361         int pos = fText.getEndIndex();
362         fText.setIndex(pos);
363         return pos;
364     }
365     
366     
367     /**
368      * Advances the iterator either forward or backward the specified number of steps.
369      * Negative values move backward, and positive values move forward.  This is
370      * equivalent to repeatedly calling next() or previous().
371      * @param n The number of steps to move.  The sign indicates the direction
372      * (negative is backwards, and positive is forwards).
373      * @return The character offset of the boundary position n boundaries away from
374      * the current one.
375      * @stable ICU 2.0
376      */
377     public int next(int n) {
378         int result = current();
379         while (n > 0) {
380             result = handleNext();
381             --n;
382         }
383         while (n < 0) {
384             result = previous();
385             ++n;
386         }
387         return result;
388     }
389     
390     
391     /**
392      * Advances the iterator to the next boundary position.
393      * @return The position of the first boundary after this one.
394      * @stable ICU 2.0
395      */
396     public int next() {
397         return handleNext();
398     }
399     
400     
401     /**
402      * Moves the iterator backwards, to the last boundary preceding this one.
403      * @return The position of the last boundary position preceding this one.
404      * @stable ICU 2.0
405      */
406     public int previous() {
407         // if we're already sitting at the beginning of the text, return DONE
408         if (fText == null || current() == fText.getBeginIndex()) {
409             fLastRuleStatusIndex  = 0;
410             fLastStatusIndexValid = true;
411             return BreakIterator.DONE;
412         }
413 
414         if (fRData.fSRTable != null || fRData.fSFTable != null) {
415             return handlePrevious(fRData.fRTable);
416         }
417 
418         // old rule syntax
419         // set things up.  handlePrevious() will back us up to some valid
420         // break position before the current position (we back our internal
421         // iterator up one step to prevent handlePrevious() from returning
422         // the current position), but not necessarily the last one before
423         // where we started
424 
425         int       start = current();
426 
427         CIPrevious32(fText);
428         int       lastResult    = handlePrevious(fRData.fRTable);
429         if (lastResult == BreakIterator.DONE) {
430             lastResult = fText.getBeginIndex();
431             fText.setIndex(lastResult);
432         }
433         int       result        = lastResult;
434         int       lastTag       = 0;
435         boolean   breakTagValid = false;
436 
437         // iterate forward from the known break position until we pass our
438         // starting point.  The last break position before the starting
439         // point is our return value
440 
441         for (;;) {
442             result         = handleNext();
443             if (result == BreakIterator.DONE || result >= start) {
444                 break;
445             }
446             lastResult     = result;
447             lastTag        = fLastRuleStatusIndex;
448             breakTagValid  = true;
449         }
450 
451         // fLastBreakTag wants to have the value for section of text preceding
452         // the result position that we are to return (in lastResult.)  If
453         // the backwards rules overshot and the above loop had to do two or more
454         // handleNext()s to move up to the desired return position, we will have a valid
455         // tag value. But, if handlePrevious() took us to exactly the correct result positon,
456         // we wont have a tag value for that position, which is only set by handleNext().
457 
458         // set the current iteration position to be the last break position
459         // before where we started, and then return that value
460         fText.setIndex(lastResult);
461         fLastRuleStatusIndex  = lastTag;       // for use by getRuleStatus()
462         fLastStatusIndexValid = breakTagValid;
463         return lastResult;
464     }
465     /**
466      * Sets the iterator to refer to the first boundary position following
467      * the specified position.
468      * @param offset The position from which to begin searching for a break position.
469      * @return The position of the first break after the current position.
470      * @stable ICU 2.0
471      */
472     public int following(int offset) {
473         // if the offset passed in is already past the end of the text,
474         // just return DONE; if it's before the beginning, return the
475         // text's starting offset
476         fLastRuleStatusIndex  = 0;
477         fLastStatusIndexValid = true;
478         if (fText == null || offset >= fText.getEndIndex()) {
479             last();
480             return next();
481         }
482         else if (offset < fText.getBeginIndex()) {
483             return first();
484         }
485 
486         // otherwise, set our internal iteration position (temporarily)
487         // to the position passed in.  If this is the _beginning_ position,
488         // then we can just use next() to get our return value
489 
490         int result = 0;
491 
492         if (fRData.fSRTable != null) {
493             // Safe Point Reverse rules exist.
494             //   This allows us to use the optimum algorithm.
495             fText.setIndex(offset);
496             // move forward one codepoint to prepare for moving back to a
497             // safe point.
498             // this handles offset being between a supplementary character
499             CINext32(fText);
500             // handlePrevious will move most of the time to < 1 boundary away
501             handlePrevious(fRData.fSRTable);
502             result = next();
503             while (result <= offset) {
504                 result = next();
505             }
506             return result;
507         }
508         if (fRData.fSFTable != null) {
509             // No Safe point reverse table, but there is a safe pt forward table.
510             // 
511             fText.setIndex(offset);
512             CIPrevious32(fText);
513             // handle next will give result >= offset
514             handleNext(fRData.fSFTable);
515             // previous will give result 0 or 1 boundary away from offset,
516             // most of the time
517             // we have to
518             int oldresult = previous();
519             while (oldresult > offset) {
520                 result = previous();
521                 if (result <= offset) {
522                     return oldresult;
523                 }
524                 oldresult = result;
525             }
526             result = next();
527             if (result <= offset) {
528                 return next();
529             }
530             return result;
531         }
532         // otherwise, we have to sync up first.  Use handlePrevious() to back
533         // us up to a known break position before the specified position (if
534         // we can determine that the specified position is a break position,
535         // we don't back up at all).  This may or may not be the last break
536         // position at or before our starting position.  Advance forward
537         // from here until we've passed the starting position.  The position
538         // we stop on will be the first break position after the specified one.
539         // old rule syntax
540 
541         fText.setIndex(offset);
542         if (offset == fText.getBeginIndex()) {
543             return handleNext();
544         }
545         result = previous();
546 
547         while (result != BreakIterator.DONE && result <= offset) {
548             result = next();
549         }
550 
551         return result;
552     }
553     /**
554      * Sets the iterator to refer to the last boundary position before the
555      * specified position.
556      * @param offset The position to begin searching for a break from.
557      * @return The position of the last boundary before the starting position.
558      * @stable ICU 2.0
559      */
560     public int preceding(int offset) {
561         // if the offset passed in is already past the end of the text,
562         // just return DONE; if it's before the beginning, return the
563 
564         // text's starting offset
565         if (fText == null || offset > fText.getEndIndex()) {
566             // return BreakIterator::DONE;
567             return last();
568         }
569         else if (offset < fText.getBeginIndex()) {
570             return first();
571         }
572 
573         // if we start by updating the current iteration position to the
574         // position specified by the caller, we can just use previous()
575         // to carry out this operation
576 
577         int  result;
578         if (fRData.fSFTable != null) {
579             /// todo synwee
580             // new rule syntax
581             fText.setIndex(offset);
582             // move backwards one codepoint to prepare for moving forwards to a
583             // safe point.
584             // this handles offset being between a supplementary character
585             CIPrevious32(fText);
586             handleNext(fRData.fSFTable);
587             result = previous();
588             while (result >= offset) {
589                 result = previous();
590             }
591             return result;
592         }
593         if (fRData.fSRTable != null) {
594             // backup plan if forward safe table is not available
595             fText.setIndex(offset);
596             CINext32(fText);
597             // handle previous will give result <= offset
598             handlePrevious(fRData.fSRTable);
599 
600             // next will give result 0 or 1 boundary away from offset,
601             // most of the time
602             // we have to
603             int oldresult = next();
604             while (oldresult < offset) {
605                 result = next();
606                 if (result >= offset) {
607                     return oldresult;
608                 }
609                 oldresult = result;
610             }
611             result = previous();
612             if (result >= offset) {
613                 return previous();
614             }
615             return result;
616         }
617 
618         // old rule syntax
619         fText.setIndex(offset);
620         return previous();
621     }
622 
623     /**
624      * Throw IllegalArgumentException unless begin <= offset < end.
625      * @stable ICU 2.0
626      */
627     protected static final void checkOffset(int offset, CharacterIterator   text) {
628         if (offset < text.getBeginIndex() || offset > text.getEndIndex()) {
629             throw new IllegalArgumentException  ("offset out of bounds");
630         }
631     }
632 
633 
634 /**
635  * Returns true if the specfied position is a boundary position.  As a side
636  * effect, leaves the iterator pointing to the first boundary position at
637  * or after "offset".
638  * @param offset the offset to check.
639  * @return True if "offset" is a boundary position.
640  * @stable ICU 2.0
641  */
642 public boolean isBoundary(int offset) {
643     checkOffset(offset, fText);
644     
645     // the beginning index of the iterator is always a boundary position by definition
646     if (offset == fText.getBeginIndex()) {
647         first();       // For side effects on current position, tag values.
648         return true;
649     }
650 
651     if (offset == fText.getEndIndex()) {
652         last();       // For side effects on current position, tag values.
653         return true;
654     }
655 
656     // otherwise, we can use following() on the position before the specified
657     // one and return true if the position we get back is the one the user
658     // specified
659     
660     // return following(offset - 1) == offset;
661     // TODO:  check whether it is safe to revert to the simpler offset-1 code
662     //         The safe rules may take care of unpaired surrogates ok.
663     fText.setIndex(offset);
664     CIPrevious32(fText);
665     int  pos = fText.getIndex();
666     boolean result = following(pos) == offset;
667     return result;
668 }
669 
670 /**
671  * Returns the current iteration position.
672  * @return The current iteration position.
673  * @stable ICU 2.0
674  */
675 public int current() {
676     return (fText != null) ? fText.getIndex() : BreakIterator.DONE;
677     }
678 
679 
680 
681 private void makeRuleStatusValid() {
682     if (fLastStatusIndexValid == false) {
683         //  No cached status is available.
684         if (fText == null || current() == fText.getBeginIndex()) {
685             //  At start of text, or there is no text.  Status is always zero.
686             fLastRuleStatusIndex = 0;
687             fLastStatusIndexValid = true;
688         } else {
689             //  Not at start of text.  Find status the tedious way.
690             int pa = current();
691             previous();
692             int pb = next();
693             Assert.assrt (pa == pb);
694         }
695         Assert.assrt(fLastStatusIndexValid == true);
696         Assert.assrt(fLastRuleStatusIndex >= 0  &&  fLastRuleStatusIndex < fRData.fStatusTable.length);
697     }
698 }
699 
700 
701 /**
702  * Return the status tag from the break rule that determined the most recently
703  * returned break position.  The values appear in the rule source
704  * within brackets, {123}, for example.  For rules that do not specify a
705  * status, a default value of 0 is returned.  If more than one rule applies,
706  * the numerically largest of the possible status values is returned.
707  * <p>
708  * Of the standard types of ICU break iterators, only the word break
709  * iterator provides status values.  The values are defined in
710  * class RuleBasedBreakIterator, and allow distinguishing between words
711  * that contain alphabetic letters, "words" that appear to be numbers,
712  * punctuation and spaces, words containing ideographic characters, and
713  * more.  Call <code>getRuleStatus</code> after obtaining a boundary
714  * position from <code>next()<code>, <code>previous()</code>, or 
715  * any other break iterator functions that returns a boundary position.
716  * <p>
717  * @return the status from the break rule that determined the most recently
718  * returned break position.
719  *
720  * @draft ICU 3.0
721  * @provisional This is a draft API and might change in a future release of ICU.
722  */
723 
724 public int  getRuleStatus() {
725     makeRuleStatusValid();
726     //   Status records have this form:
727     //           Count N         <--  fLastRuleStatusIndex points here.
728     //           Status val 0
729     //           Status val 1
730     //              ...
731     //           Status val N-1  <--  the value we need to return
732     //   The status values are sorted in ascending order.
733     //   This function returns the last (largest) of the array of status values.
734     int  idx = fLastRuleStatusIndex + fRData.fStatusTable[fLastRuleStatusIndex];
735     int  tagVal = fRData.fStatusTable[idx];
736 
737     return tagVal;
738 }
739 
740 
741 
742 /**
743  * Get the status (tag) values from the break rule(s) that determined the most 
744  * recently returned break position.  The values appear in the rule source
745  * within brackets, {123}, for example.  The default status value for rules
746  * that do not explicitly provide one is zero.
747  * <p>
748  * The status values used by the standard ICU break rules are defined
749  * as public constants in class RuleBasedBreakIterator.
750  * <p>
751  * If the size  of the output array is insufficient to hold the data,
752  *  the output will be truncated to the available length.  No exception
753  *  will be thrown.
754  *
755  * @param fillInArray an array to be filled in with the status values.  
756  * @return          The number of rule status values from rules that determined 
757  *                  the most recent boundary returned by the break iterator.
758  *                  In the event that the array is too small, the return value
759  *                  is the total number of status values that were available,
760  *                  not the reduced number that were actually returned.
761  * @draft ICU 3.0
762  * @provisional This is a draft API and might change in a future release of ICU.
763  */
764 public int getRuleStatusVec(int[] fillInArray) {
765     makeRuleStatusValid();
766     int numStatusVals = fRData.fStatusTable[fLastRuleStatusIndex];
767     if (fillInArray != null) {  
768         int numToCopy = Math.min(numStatusVals, fillInArray.length);
769         for (int i=0; i<numToCopy; i++) {
770             fillInArray[i] = fRData.fStatusTable[fLastRuleStatusIndex + i + 1];
771         }
772     }
773     return numStatusVals;
774  }
775 
776 
777 /**
778  * Return a CharacterIterator over the text being analyzed.  This version
779  * of this method returns the actual CharacterIterator we're using internally.
780  * Changing the state of this iterator can have undefined consequences.  If
781  * you need to change it, clone it first.
782  * @return An iterator over the text being analyzed.
783  * @stable ICU 2.0
784  */
785     public CharacterIterator   getText() {
786         return fText;
787     }
788 
789 
790     /**
791      * Set the iterator to analyze a new piece of text.  This function resets
792      * the current iteration position to the beginning of the text.
793      * @param newText An iterator over the text to analyze.
794      * @stable ICU 2.0
795      */
796     public void setText(CharacterIterator   newText) {
797         fText = newText;
798         this.first();
799     }
800     
801     /**
802      * Value of the U_RBBIDEBUG system property, read once when the class is initialized.  Controls debug, trace and dump options.
803      * @internal
804      * @deprecated This API is ICU internal only.
805      */
806     protected static String   fDebugEnv = System.getProperty("U_RBBIDEBUG");
807 
808     
809     // 32 bit char value returned when an iterator has run out of range.
810     //     Positive value so the fast case (not end, not surrogate) can be checked
811     //     with a single test.
812     private static int CI_DONE32 = 0x7fffffff;
813     
814     /**
815      * Move the iterator forward to the next code point, and return that code point,
816      *   leaving the iterator positioned at char returned.
817      *   For Supplementary chars, the iterator is left positioned at the lead surrogate.
818      * @param ci  The character iterator
819      * @return    The next code point.
820      */
821      static int CINext32(CharacterIterator   ci) {
822         // If the current position is at a surrogate pair, move to the trail surrogate
823         //   which leaves it in positon for underlying iterator's next() to work.
824         int c= ci.current();
825         if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE && c<=UTF16.LEAD_SURROGATE_MAX_VALUE) {
826             c = ci.next();   
827             if (c<UTF16.TRAIL_SURROGATE_MIN_VALUE || c>UTF16.TRAIL_SURROGATE_MAX_VALUE) {
828                c = ci.previous();   
829             }
830         }
831 
832         // For BMP chars, this next() is the real deal.
833         c = ci.next();
834         
835         // If we might have a lead surrogate, we need to peak ahead to get the trail 
836         //  even though we don't want to really be positioned there.
837         if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE) {
838             c = CINextTrail32(ci, c);   
839         }
840         
841         if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && c != CI_DONE32) {
842             // We got a supplementary char.  Back the iterator up to the postion
843             // of the lead surrogate.
844             ci.previous();   
845         }
846         return c;
847    }
848 
849     
850     // Out-of-line portion of the in-line Next32 code.
851     // The call site does an initial ci.next() and calls this function
852     //    if the 16 bit value it gets is >= LEAD_SURROGATE_MIN_VALUE.
853     // NOTE:  we leave the underlying char iterator positioned in the
854     //        middle of a surroage pair.  ci.next() will work correctly
855     //        from there, but the ci.getIndex() will be wrong, and needs
856     //        adjustment.
857     private static int CINextTrail32(CharacterIterator   ci, int lead) {
858         int retVal = lead;
859         if (lead <= UTF16.LEAD_SURROGATE_MAX_VALUE) {
860             char  cTrail = ci.next();
861             if (UTF16.isTrailSurrogate(cTrail)) {
862                 retVal = ((lead  - UTF16.LEAD_SURROGATE_MIN_VALUE) << 10) +
863                             (cTrail - UTF16.TRAIL_SURROGATE_MIN_VALUE) +
864                             UTF16.SUPPLEMENTARY_MIN_VALUE;
865             } else {
866                 ci.previous();
867             }
868         } else {
869             if (lead == CharacterIterator.DONE && ci.getIndex() >= ci.getEndIndex()) {
870                 retVal = CI_DONE32;
871             }
872         }
873         return retVal;
874     }
875        
876     private static int CIPrevious32(CharacterIterator   ci) {
877         if (ci.getIndex() <= ci.getBeginIndex()) {
878             return CI_DONE32;   
879         }
880         char trail = ci.previous();
881         int retVal = trail;
882         if (UTF16.isTrailSurrogate(trail) && ci.getIndex()>ci.getBeginIndex()) {
883             char lead = ci.previous();
884             if (UTF16.isLeadSurrogate(lead)) {
885                 retVal = (((int)lead  - UTF16.LEAD_SURROGATE_MIN_VALUE) << 10) +
886                           ((int)trail - UTF16.TRAIL_SURROGATE_MIN_VALUE) +
887                           UTF16.SUPPLEMENTARY_MIN_VALUE;
888             } else {
889                 ci.next();
890             }           
891         }
892         return retVal;
893     }
894    
895     static int CICurrent32(CharacterIterator   ci) {
896         char  lead   = ci.current();
897         int   retVal = lead;
898         if (retVal < UTF16.LEAD_SURROGATE_MIN_VALUE) {
899             return retVal;   
900         }
901         if (UTF16.isLeadSurrogate(lead)) {
902             int  trail = (int)ci.next();
903             ci.previous();
904             if (UTF16.isTrailSurrogate((char)trail)) {
905                 retVal = ((lead  - UTF16.LEAD_SURROGATE_MIN_VALUE) << 10) +
906                          (trail - UTF16.TRAIL_SURROGATE_MIN_VALUE) +
907                          UTF16.SUPPLEMENTARY_MIN_VALUE;
908             }
909          } else {
910             if (lead == CharacterIterator.DONE) {
911                 if (ci.getIndex() >= ci.getEndIndex())   {
912                     retVal = CI_DONE32;   
913                 }
914             }
915          }
916         return retVal;
917     }
918     
919 
920     //-----------------------------------------------------------------------------------
921     //
922     //      handleNext(void)    All forward iteration vectors through this function.
923     //                          NOTE:  This function is overridden by the dictionary base break iterator.
924     //                                 User level API functions go to the dbbi implementation
925     //                                     when the break iterator type is dbbi.
926     //                                 The DBBI implementation sometimes explicitly calls back to here, 
927     //                                     its inherited handleNext().
928     //                      
929     //-----------------------------------------------------------------------------------
930     int handleNext() {
931         return handleNext(fRData.fFTable);
932     }
933 
934     /**
935      * The State Machine Engine for moving forward is here.
936      * This function is the heart of the RBBI run time engine.
937      * 
938      * @param stateTable
939      * @return the new iterator position
940      * 
941      * A note on supplementary characters and the position of underlying
942      * Java CharacterIterator:   Normally, a character iterator is positioned at
943      * the char most recently returned by next().  Within this function, when
944      * a supplementary char is being processed, the char iterator is left
945      * sitting on the trail surrogate, in the middle of the code point.
946      * This is different from everywhere else, where an iterator always
947      * points at the lead surrogate of a supplementary.
948      */
949     private int handleNext(short stateTable[]) {
950         int               state;
951         short             category        = 0;
952         int               mode;
953         int               row;
954         int               c;
955         int               lookaheadStatus = 0;
956         int               lookaheadTagIdx = 0;
957         int               result          = 0;
958         int               initialPosition = 0;
959         int               lookaheadResult = 0;
960         boolean          lookAheadHardBreak = 
961             (stateTable[RBBIDataWrapper.FLAGS+1] & RBBIDataWrapper.RBBI_LOOKAHEAD_HARD_BREAK) != 0;
962         
963         if (fTrace) {
964             System.out.println("Handle Next   pos      char  state category");
965         }
966 
967         // No matter what, handleNext alway correctly sets the break tag value.
968         fLastStatusIndexValid = true;
969         fLastRuleStatusIndex  = 0;
970 
971         // if we're already at the end of the text, return DONE.
972         if (fText == null) {
973             fLastRuleStatusIndex = 0;
974             return BreakIterator.DONE;
975         }
976 
977         // Set up the starting char
978         initialPosition = fText.getIndex();
979         result          = initialPosition;
980         c               = fText.current();
981         if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE) {
982             c = CINextTrail32(fText, c);
983             if (c == CI_DONE32) {
984                 fLastRuleStatusIndex = 0;
985                 return BreakIterator.DONE;
986             }
987         }
988 
989         // Set the initial state for the state machine
990         state           = START_STATE;
991         row             = fRData.getRowIndex(state); 
992         category        = 3;
993         mode            = RBBI_RUN;
994         if ((stateTable[RBBIDataWrapper.FLAGS+1] & RBBIDataWrapper.RBBI_BOF_REQUIRED) != 0) {
995             category = 2;
996             mode     = RBBI_START;
997         }
998 
999 
1000        // loop until we reach the end of the text or transition to state 0
1001        while (state != STOP_STATE) {
1002            if (c == CI_DONE32) {
1003                // Reached end of input string.
1004                if (mode == RBBI_END) {
1005                    // We have already run the loop one last time with the
1006                    // character set to the pseudo {eof} value. Now it is time
1007                    // to unconditionally bail out.
1008
1009                    if (lookaheadResult > result) {
1010                        // We ran off the end of the string with a pending
1011                        // look-ahead match.
1012                        // Treat this as if the look-ahead condition had been
1013                        // met, and return
1014                        // the match at the / position from the look-ahead rule.
1015                        result = lookaheadResult;
1016                        fLastRuleStatusIndex = lookaheadTagIdx;
1017                        lookaheadStatus = 0;
1018                    } else if (result == initialPosition) {
1019                        // Ran off end, no match found.
1020                        // move forward one
1021                        fText.setIndex(initialPosition);
1022                        CINext32(fText);
1023                    }
1024                    break;
1025                }
1026                // Run the loop one last time with the fake end-of-input character category
1027                mode = RBBI_END;
1028                category = 1;
1029            }
1030            
1031            // Get the char category.  An incoming category of 1 or 2 mens that
1032            //      we are preset for doing the beginning or end of input, and
1033            //      that we shouldn't get a category from an actual text input character.
1034            //
1035            if (mode == RBBI_RUN) {
1036                // look up the current character's character category, which tells us
1037                // which column in the state table to look at.
1038                //
1039                category = (short) fRData.fTrie.getCodePointValue(c);
1040                
1041                // Check the dictionary bit in the character's category.
1042                //    Counter is only used by dictionary based iterators (subclasses).
1043                //    Chars that need to be handled by a dictionary have a flag bit set
1044                //    in their category values.
1045                //
1046                if ((category & 0x4000) != 0)  {
1047                    fDictionaryCharCount++;
1048                    //  And off the dictionary flag bit.
1049                    category &= ~0x4000;
1050                }
1051           }
1052
1053            if (fTrace) {
1054                System.out.print("            " +  RBBIDataWrapper.intToString(fText.getIndex(), 5)); 
1055                System.out.print(RBBIDataWrapper.intToHexString(c, 10));
1056                System.out.println(RBBIDataWrapper.intToString(state,7) + RBBIDataWrapper.intToString(category,6));
1057            }
1058
1059            // look up a state transition in the state table
1060            //     state = row->fNextState[category];
1061            state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category];
1062            row   = fRData.getRowIndex(state);  
1063
1064            // Advance to the next character.  
1065            // If this is a beginning-of-input loop iteration, don't advance.
1066            //    The next iteration will be processing the first real input character.
1067            if (mode == RBBI_RUN) {
1068                c = (int)fText.next(); 
1069                if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE) {
1070                    c = CINextTrail32(fText, c);
1071                }
1072            } else {
1073                if (mode == RBBI_START) {
1074                    mode = RBBI_RUN;
1075                }
1076            }
1077             
1078            if (stateTable[row + RBBIDataWrapper.ACCEPTING] == -1) {
1079                // Match found, common case
1080                result = fText.getIndex();
1081                if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && c != CI_DONE32) {
1082                    // The iterator has been left in the middle of a surrogate pair.
1083                    // We want the start of it.
1084                    result--;
1085                }
1086
1087                //  Remember the break status (tag) values.
1088                fLastRuleStatusIndex = stateTable[row + RBBIDataWrapper.TAGIDX];
1089            }
1090
1091            if (stateTable[row + RBBIDataWrapper.LOOKAHEAD] != 0) {
1092                if (lookaheadStatus != 0
1093                    && stateTable[row + RBBIDataWrapper.ACCEPTING] == lookaheadStatus) {
1094                    // Lookahead match is completed.  Set the result accordingly, but only
1095                    // if no other rule has matched further in the mean time.
1096                    result               = lookaheadResult;
1097                    fLastRuleStatusIndex = lookaheadTagIdx;
1098                    lookaheadStatus      = 0;
1099                    // TODO: make a standalone hard break in a rule work.
1100                    if (lookAheadHardBreak) {
1101                        return result;
1102                    }
1103                    // Look-ahead completed, but other rules may match further.  Continue on.
1104                    //   TODO:  junk this feature?  I don't think it's used anywhere.
1105                    continue;
1106                }
1107
1108                lookaheadResult = fText.getIndex();
1109                if (c>=UTF16.SUPPLEMENTARY_MIN_VALUE && c!=CI_DONE32) {
1110                    // The iterator has been left in the middle of a surrogate pair.
1111                    // We want the beginning  of it.
1112                    lookaheadResult--;
1113                }
1114                lookaheadStatus = stateTable[row + RBBIDataWrapper.LOOKAHEAD];
1115                lookaheadTagIdx = stateTable[row + RBBIDataWrapper.TAGIDX];
1116                continue;
1117            }
1118
1119
1120            if (stateTable[row + RBBIDataWrapper.ACCEPTING] != 0) {
1121                // Because this is an accepting state, any in-progress look-ahead match
1122                //   is no longer relavant.  Clear out the pending lookahead status.
1123                lookaheadStatus = 0; 
1124            }
1125            
1126         }        // End of state machine main loop
1127
1128        // The state machine is done.  Check whether it found a match...
1129
1130        // If the iterator failed to advance in the match engine, force it ahead by one.
1131        //   (This really indicates a defect in the break rules.  They should always match
1132        //    at least one character.)
1133        if (result == initialPosition) {
1134            result = fText.setIndex(initialPosition);
1135            CINext32(fText);
1136            result = fText.getIndex();
1137        }
1138
1139        // Leave the iterator at our result position.
1140        //   (we may have advanced beyond the last accepting position chasing after
1141        //    longer matches that never completed.)
1142        fText.setIndex(result);
1143        if (fTrace) {
1144            System.out.println("result = " + result);
1145        }
1146        return result;
1147    }
1148
1149    
1150    
1151    private int handlePrevious(short stateTable[]) {
1152        int            state;
1153        int            category           = 0;
1154        int            mode;
1155        int            row;        
1156        int            c;
1157        int            lookaheadStatus    = 0;
1158        int            result             = 0;
1159        int            initialPosition    = 0;
1160        int            lookaheadResult    = 0;
1161        boolean        lookAheadHardBreak = 
1162            (stateTable[RBBIDataWrapper.FLAGS+1] & RBBIDataWrapper.RBBI_LOOKAHEAD_HARD_BREAK) != 0;
1163        
1164        
1165        if (fText == null || stateTable == null) {
1166            return 0;
1167        }
1168        // handlePrevious() never gets the rule status.
1169        // Flag the status as invalid; if the user ever asks for status, we will need
1170        // to back up, then re-find the break position using handleNext(), which does
1171        // get the status value.
1172        fLastStatusIndexValid = false;
1173        fLastRuleStatusIndex  = 0;
1174        
1175        // set up the starting char
1176        initialPosition = fText.getIndex();
1177        result          = initialPosition;
1178        c               = CIPrevious32(fText);
1179        
1180        // Set up the initial state for the state machine
1181        state = START_STATE;
1182        row = fRData.getRowIndex(state);
1183        category = 3;   // TODO:  obsolete?  from the old start/run mode scheme?
1184        mode     = RBBI_RUN;
1185        if ((stateTable[RBBIDataWrapper.FLAGS+1] & RBBIDataWrapper.RBBI_BOF_REQUIRED) != 0) {
1186            category = 2;
1187            mode     = RBBI_START;
1188        }
1189        
1190        if (fTrace) {
1191            System.out.println("Handle Prev   pos   char  state category ");
1192        }
1193        
1194        // loop until we reach the beginning of the text or transition to state 0
1195        //
1196        mainLoop: for (;;) {
1197            innerBlock: {
1198                if (c == CI_DONE32) {
1199                    // Reached end of input string.
1200                    if (mode == RBBI_END || fRData.fHeader.fVersion == 1) {
1201                        // Either this is the old (ICU 3.2 and earlier) format data which
1202                        // does not support explicit support for matching {eof}, or
1203                        // we have already done the {eof} iteration.  Now is the time
1204                        // to unconditionally bail out.
1205                        if (lookaheadResult < result) {
1206                            // We ran off the end of the string with a pending look-ahead match.
1207                            // Treat this as if the look-ahead condition had been met, and return
1208                            //  the match at the / position from the look-ahead rule.
1209                            result = lookaheadResult;
1210                            lookaheadStatus = 0;
1211                        } else if (result == initialPosition) {
1212                            // Ran off start, no match found.
1213                            // Move one position (towards the start, since we are doing previous.)
1214                            fText.setIndex(initialPosition);
1215                            CIPrevious32(fText);
1216                        }
1217                        break mainLoop;
1218                    }
1219                    mode = RBBI_END;
1220                    category = 1;
1221                }
1222                
1223                if (mode == RBBI_RUN) {
1224                    // look up the current character's category, which tells us
1225                    // which column in the state table to look at.
1226                    //
1227                    category = (short) fRData.fTrie.getCodePointValue(c);
1228                    
1229                    // Check the dictionary bit in the character's category.
1230                    //    Counter is only used by dictionary based iterators (subclasses).
1231                    //    Chars that need to be handled by a dictionary have a flag bit set
1232                    //    in their category values.
1233                    //
1234                    if ((category & 0x4000) != 0)  {
1235                        fDictionaryCharCount++;
1236                        //  And off the dictionary flag bit.
1237                        category &= ~0x4000;
1238                    }
1239                }
1240                
1241                
1242                if (fTrace) {
1243                    System.out.print("             " + fText.getIndex() + "   ");
1244                    if (0x20 <= c && c < 0x7f) {
1245                        System.out.print("  " + c + "  ");
1246                    } else {
1247                        System.out.print(" " + Integer.toHexString(c) + " ");
1248                    }
1249                    System.out.println(" " + state + "  " + category + " ");
1250                }
1251                
1252                // State Transition - move machine to its next state
1253                //
1254                state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category];
1255                row = fRData.getRowIndex(state);
1256                
1257                if (stateTable[row + RBBIDataWrapper.ACCEPTING] == -1) {
1258                    // Match found, common case, could have lookahead so we move
1259                    // on to check it
1260                    result = fText.getIndex();
1261                }
1262                
1263                if (stateTable[row + RBBIDataWrapper.LOOKAHEAD] != 0) {
1264                    if (lookaheadStatus != 0
1265                            && stateTable[row + RBBIDataWrapper.ACCEPTING] == lookaheadStatus) {
1266                        // Lookahead match is completed. Set the result
1267                        // accordingly, but only
1268                        // if no other rule has matched further in the mean
1269                        // time.
1270                        result = lookaheadResult;
1271                        lookaheadStatus = 0;
1272                        // TODO: make a standalone hard break in a rule work.
1273                        
1274                        if (lookAheadHardBreak) {
1275                            break mainLoop;
1276                        }
1277                        // Look-ahead completed, but other rules may match further.
1278                        // Continue on.
1279                        // TODO: junk this feature?  I don't think that it's used anywhere.
1280                        break innerBlock;
1281                    }
1282                    // Hit a possible look-ahead match. We are at the
1283                    // position of the '/'. Remember this position.
1284                    lookaheadResult = fText.getIndex();
1285                    lookaheadStatus = stateTable[row + RBBIDataWrapper.LOOKAHEAD];
1286                    break innerBlock;
1287                } 
1288                
1289                // not lookahead...
1290                if (stateTable[row + RBBIDataWrapper.ACCEPTING] != 0) {
1291                    // This is a plain (non-look-ahead) accepting state.
1292                    if (!lookAheadHardBreak) {
1293                        // Clear out any pending look-ahead matches,
1294                        // but only if not doing the lookAheadHardBreak option
1295                        // which needs to force a break no matter what is going
1296                        // on with the rest of the match, i.e. we can't abandon
1297                        // a partially completed look-ahead match because
1298                        // some other rule matched further than the '/' position
1299                        // in the look-ahead match.
1300                        lookaheadStatus = 0; 
1301                    }
1302                }
1303                
1304            } // end of innerBlock.  "break innerBlock" in above code comes out here.
1305        
1306        
1307            if (state == STOP_STATE) {
1308                // Normal loop exit is here
1309                break mainLoop;
1310            }
1311        
1312            // then move iterator position backwards one character
1313            //
1314            if (mode == RBBI_RUN) {
1315                c = CIPrevious32(fText);
1316            } else {
1317                if (mode == RBBI_START) {
1318                    mode = RBBI_RUN;
1319                }
1320            }
1321        
1322        
1323        }   // End of the main loop.
1324        
1325        // The state machine is done.  Check whether it found a match...
1326        //
1327        // If the iterator failed to advance in the match engine, force it ahead by one.
1328        //   (This really indicates a defect in the break rules.  They should always match
1329        //    at least one character.)
1330        if (result == initialPosition) {
1331            result = fText.setIndex(initialPosition);
1332            CIPrevious32(fText);
1333            result = fText.getIndex();
1334        }
1335        
1336        fText.setIndex(result);
1337        if (fTrace) {
1338            System.out.println("Result = " + result);
1339        }
1340        
1341        return result;
1342    }
1343
1344
1345
1346
1347
1348    //-------------------------------------------------------------------------------
1349    
1350    //
1351    
1352    //  isDictionaryChar      Return true if the category lookup for this char
1353    
1354    //                        indicates that it is in the set of dictionary lookup
1355    
1356    //                        chars.
1357    
1358    //
1359    
1360    //                        This function is intended for use by dictionary based
1361    
1362    //                        break iterators.
1363    
1364    //
1365    
1366    //-------------------------------------------------------------------------------
1367    
1368    boolean isDictionaryChar(int c) {
1369    
1370        short  category = (short) fRData.fTrie.getCodePointValue(c);
1371    
1372        return (category & 0x4000) != 0;
1373    
1374    }
1375
1376}
1377//eof
1378
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags