// Listing source: KickJava - "Java API By Example, From Geeks To Geeks."
//
// Java > Open Source Codes > com > ibm > icu > text > RuleBasedBreakIterator_New


/*
 *******************************************************************************
 * Copyright (C) 2006 International Business Machines Corporation and         *
 * others. All Rights Reserved.                                                *
 *******************************************************************************
 */

package com.ibm.icu.text;

import java.io.IOException;
import java.io.InputStream;
import java.text.CharacterIterator;

import com.ibm.icu.impl.Assert;

/**
 * Rule Based Break Iterator implementation.
 * This is a port of the C++ class RuleBasedBreakIterator from ICU4C.
 *
 * A note on future plans: Once a new DictionaryBasedBreakIterator implementation
 * is completed, the archaic implementation class
 * RuleBasedBreakIterator_Old can be completely removed,
 * and this class can be renamed to be simply
 * RuleBasedBreakIterator.
 * @internal
 */

27 public class RuleBasedBreakIterator_New extends RuleBasedBreakIterator {
28     private static final boolean ASSERT = false;
29     
30     private static final int START_STATE = 1; // The state number of the starting state
31
private static final int STOP_STATE = 0; // The state-transition value indicating "stop"
32

33     /** @internal */
34     RuleBasedBreakIterator_New() {
35     }
36
37     /**
38      * The character iterator through which this BreakIterator accesses the text
39      * @internal
40      */

41     private CharacterIterator JavaDoc fText;
42     
43     /**
44      * The rule data for this BreakIterator instance
45      * @internal
46      */

47     private RBBIDataWrapper fRData;
48
49     /** Index of the Rule {tag} values for the most recent match.
50      * @internal
51     */

52     private int fLastRuleStatusIndex;
53
54     /**
55      * Rule tag value valid flag.
56      * Some iterator operations don't intrinsically set the correct tag value.
57      * This flag lets us lazily compute the value if we are ever asked for it.
58      * @internal
59      */

60     private boolean fLastStatusIndexValid;
61     
62     /**
63      * Debugging flag. Trace operation of state machine when true.
64      * @internal
65      */

66     public static boolean fTrace;
67     
68     
69     /**
70      * Dump the contents of the state table and character classes for this break iterator.
71      * For debugging only.
72      * @internal
73      */

74     public void dump() {
75         this.fRData.dump();
76     }
77
78
//=======================================================================
// boilerplate
//=======================================================================

/**
83      * Clones this iterator.
84      * @return A newly-constructed RuleBasedBreakIterator with the same
85      * behavior as this one.
86      * @stable ICU 2.0
87      */

88     public Object JavaDoc clone()
89     {
90         RuleBasedBreakIterator_New result = (RuleBasedBreakIterator_New) super.clone();
91         if (fText != null) {
92             fText = (CharacterIterator JavaDoc)fText.clone();
93         }
94         return result;
95     }
96
97     /**
98      * Returns true if both BreakIterators are of the same class, have the same
99      * rules, and iterate over the same text.
100      * @stable ICU 2.0
101      */

102     public boolean equals(Object JavaDoc that) {
103         try {
104             RuleBasedBreakIterator_New other = (RuleBasedBreakIterator_New) that;
105             if (fRData != other.fRData && (fRData == null || other.fRData == null)) {
106                 return false;
107             }
108             if (fRData != null && other.fRData != null &&
109                     (!fRData.fRuleSource.equals(other.fRData.fRuleSource))) {
110                 return false;
111             }
112             if (fText == null && other.fText == null) {
113                 return true;
114             }
115             if (fText == null || other.fText == null) {
116                 return false;
117             }
118             return fText.equals(other.fText);
119         }
120         catch(ClassCastException JavaDoc e) {
121             return false;
122         }
123      }
124
125     /**
126      * Returns the description (rules) used to create this iterator.
127      * (In ICU4C, the same function is RuleBasedBreakIterator::getRules())
128      * @stable ICU 2.0
129      */

130     public String JavaDoc toString() {
131         String JavaDoc retStr = null;
132         if (fRData != null) {
133             retStr = fRData.fRuleSource;
134         }
135         return retStr;
136     }
137
138     /**
139      * Compute a hashcode for this BreakIterator
140      * @return A hash code
141      * @stable ICU 2.0
142      */

143     public int hashCode()
144     {
145         return fRData.fRuleSource.hashCode();
146     }
147
148     
//=======================================================================
// Constructors & Factories
//=======================================================================


153     /**
154      * Create a break iterator from a precompiled set of rules.
155      * @internal
156      */

157     public static RuleBasedBreakIterator getInstanceFromCompiledRules(InputStream JavaDoc is) throws IOException JavaDoc {
158         RuleBasedBreakIterator_New This = new RuleBasedBreakIterator_New();
159         This.fRData = RBBIDataWrapper.get(is);
160         This.fText = new java.text.StringCharacterIterator JavaDoc(""); // Note: some old tests fail if fText is null
161
// on a newly created instance.
162
return This;
163     }
164
165     
166     
//=======================================================================
// BreakIterator overrides
//=======================================================================


171     /**
172      * Sets the current iteration position to the beginning of the text.
173      * (i.e., the CharacterIterator's starting offset).
174      * @return The offset of the beginning of the text.
175      * @stable ICU 2.0
176      */

177     public int first() {
178         fLastRuleStatusIndex = 0;
179         fLastStatusIndexValid = true;
180         if (fText == null) {
181             return BreakIterator.DONE;
182         }
183         fText.first();
184         return fText.getIndex();
185     }
186     
187     
188     /**
189      * Sets the current iteration position to the end of the text.
190      * (i.e., the CharacterIterator's ending offset).
191      * @return The text's past-the-end offset.
192      * @stable ICU 2.0
193      */

194     public int last() {
195         if (fText == null) {
196             fLastRuleStatusIndex = 0;
197             fLastStatusIndexValid = true;
198             return BreakIterator.DONE;
199         }
200
201         // I'm not sure why, but t.last() returns the offset of the last character,
202
// rather than the past-the-end offset
203
//
204
// (It's so a loop like for(p=it.last(); p!=DONE; p=it.previous()) ...
205
// will work correctly.)
206

207
208         fLastStatusIndexValid = false;
209         int pos = fText.getEndIndex();
210         fText.setIndex(pos);
211         return pos;
212     }
213     
214     
215     /**
216      * Advances the iterator either forward or backward the specified number of steps.
217      * Negative values move backward, and positive values move forward. This is
218      * equivalent to repeatedly calling next() or previous().
219      * @param n The number of steps to move. The sign indicates the direction
220      * (negative is backwards, and positive is forwards).
221      * @return The character offset of the boundary position n boundaries away from
222      * the current one.
223      * @stable ICU 2.0
224      */

225     public int next(int n) {
226         int result = current();
227         while (n > 0) {
228             result = handleNext(fRData.fFTable);
229             --n;
230         }
231         while (n < 0) {
232             result = previous();
233             ++n;
234         }
235         return result;
236     }
237     
238     
239     /**
240      * Advances the iterator to the next boundary position.
241      * @return The position of the first boundary after this one.
242      * @stable ICU 2.0
243      */

244     public int next() {
245         return handleNext(fRData.fFTable);
246     }
247     
248     
249     /**
250      * Moves the iterator backwards, to the last boundary preceding this one.
251      * @return The position of the last boundary position preceding this one.
252      * @stable ICU 2.0
253      */

254     public int previous() {
255         // if we're already sitting at the beginning of the text, return DONE
256
if (fText == null || current() == fText.getBeginIndex()) {
257             fLastRuleStatusIndex = 0;
258             fLastStatusIndexValid = true;
259             return BreakIterator.DONE;
260         }
261
262         if (fRData.fSRTable != null || fRData.fSFTable != null) {
263             return handlePrevious(fRData.fRTable);
264         }
265
266         // old rule syntax
267
// set things up. handlePrevious() will back us up to some valid
268
// break position before the current position (we back our internal
269
// iterator up one step to prevent handlePrevious() from returning
270
// the current position), but not necessarily the last one before
271
// where we started
272

273         int start = current();
274
275         CIPrevious32(fText);
276         int lastResult = handlePrevious();
277         int result = lastResult;
278         int lastTag = 0;
279         boolean breakTagValid = false;
280
281         // iterate forward from the known break position until we pass our
282
// starting point. The last break position before the starting
283
// point is our return value
284

285         for (;;) {
286             result = handleNext(fRData.fFTable);
287             if (result == BreakIterator.DONE || result >= start) {
288                 break;
289             }
290             lastResult = result;
291             lastTag = fLastRuleStatusIndex;
292             breakTagValid = true;
293         }
294
295         // fLastBreakTag wants to have the value for section of text preceding
296
// the result position that we are to return (in lastResult.) If
297
// the backwards rules overshot and the above loop had to do two or more
298
// handleNext()s to move up to the desired return position, we will have a valid
299
// tag value. But, if handlePrevious() took us to exactly the correct result positon,
300
// we wont have a tag value for that position, which is only set by handleNext().
301

302         // set the current iteration position to be the last break position
303
// before where we started, and then return that value
304
fText.setIndex(lastResult);
305         fLastRuleStatusIndex = lastTag; // for use by getRuleStatus()
306
fLastStatusIndexValid = breakTagValid;
307         return lastResult;
308     }
309     /**
310      * Sets the iterator to refer to the first boundary position following
311      * the specified position.
312      * @param offset The position from which to begin searching for a break position.
313      * @return The position of the first break after the current position.
314      * @stable ICU 2.0
315      */

316     public int following(int offset) {
317         // if the offset passed in is already past the end of the text,
318
// just return DONE; if it's before the beginning, return the
319
// text's starting offset
320
fLastRuleStatusIndex = 0;
321         fLastStatusIndexValid = true;
322         if (fText == null || offset >= fText.getEndIndex()) {
323             last();
324             return next();
325         }
326         else if (offset < fText.getBeginIndex()) {
327             return first();
328         }
329
330         // otherwise, set our internal iteration position (temporarily)
331
// to the position passed in. If this is the _beginning_ position,
332
// then we can just use next() to get our return value
333

334         int result = 0;
335
336         if (fRData.fSRTable != null) {
337             // Safe Point Reverse rules exist.
338
// This allows us to use the optimum algorithm.
339
fText.setIndex(offset);
340             // move forward one codepoint to prepare for moving back to a
341
// safe point.
342
// this handles offset being between a supplementary character
343
CINext32(fText);
344             // handlePrevious will move most of the time to < 1 boundary away
345
handlePrevious(fRData.fSRTable);
346             result = next();
347             while (result <= offset) {
348                 result = next();
349             }
350             return result;
351         }
352         if (fRData.fSFTable != null) {
353             // No Safe point reverse table, but there is a safe pt forward table.
354
//
355
fText.setIndex(offset);
356             CIPrevious32(fText);
357             // handle next will give result >= offset
358
handleNext(fRData.fSFTable);
359             // previous will give result 0 or 1 boundary away from offset,
360
// most of the time
361
// we have to
362
int oldresult = previous();
363             while (oldresult > offset) {
364                 result = previous();
365                 if (result <= offset) {
366                     return oldresult;
367                 }
368                 oldresult = result;
369             }
370             result = next();
371             if (result <= offset) {
372                 return next();
373             }
374             return result;
375         }
376         // otherwise, we have to sync up first. Use handlePrevious() to back
377
// us up to a known break position before the specified position (if
378
// we can determine that the specified position is a break position,
379
// we don't back up at all). This may or may not be the last break
380
// position at or before our starting position. Advance forward
381
// from here until we've passed the starting position. The position
382
// we stop on will be the first break position after the specified one.
383
// old rule syntax
384

385         fText.setIndex(offset);
386         if (offset == fText.getBeginIndex()) {
387             return handleNext(fRData.fFTable);
388         }
389         result = previous();
390
391         while (result != BreakIterator.DONE && result <= offset) {
392             result = next();
393         }
394
395         return result;
396     }
397     /**
398      * Sets the iterator to refer to the last boundary position before the
399      * specified position.
400      * @param offset The position to begin searching for a break from.
401      * @return The position of the last boundary before the starting position.
402      * @stable ICU 2.0
403      */

404     public int preceding(int offset) {
405         // if the offset passed in is already past the end of the text,
406
// just return DONE; if it's before the beginning, return the
407

408         // text's starting offset
409
if (fText == null || offset > fText.getEndIndex()) {
410             // return BreakIterator::DONE;
411
return last();
412         }
413         else if (offset < fText.getBeginIndex()) {
414             return first();
415         }
416
417         // if we start by updating the current iteration position to the
418
// position specified by the caller, we can just use previous()
419
// to carry out this operation
420

421         int result;
422         if (fRData.fSFTable != null) {
423             /// todo synwee
424
// new rule syntax
425
fText.setIndex(offset);
426             // move backwards one codepoint to prepare for moving forwards to a
427
// safe point.
428
// this handles offset being between a supplementary character
429
CIPrevious32(fText);
430             handleNext(fRData.fSFTable);
431             result = previous();
432             while (result >= offset) {
433                 result = previous();
434             }
435             return result;
436         }
437         if (fRData.fSRTable != null) {
438             // backup plan if forward safe table is not available
439
fText.setIndex(offset);
440             CINext32(fText);
441             // handle previous will give result <= offset
442
handlePrevious(fRData.fSRTable);
443
444             // next will give result 0 or 1 boundary away from offset,
445
// most of the time
446
// we have to
447
int oldresult = next();
448             while (oldresult < offset) {
449                 result = next();
450                 if (result >= offset) {
451                     return oldresult;
452                 }
453                 oldresult = result;
454             }
455             result = previous();
456             if (result >= offset) {
457                 return previous();
458             }
459             return result;
460         }
461
462         // old rule syntax
463
fText.setIndex(offset);
464         return previous();
465     }
466
467     /**
468      * Throw IllegalArgumentException unless begin <= offset < end.
469      * @stable ICU 2.0
470      */

471     protected static final void checkOffset(int offset, CharacterIterator JavaDoc text) {
472         if (offset < text.getBeginIndex() || offset > text.getEndIndex()) {
473             throw new IllegalArgumentException JavaDoc("offset out of bounds");
474         }
475     }
476
477
478 /**
479  * Returns true if the specfied position is a boundary position. As a side
480  * effect, leaves the iterator pointing to the first boundary position at
481  * or after "offset".
482  * @param offset the offset to check.
483  * @return True if "offset" is a boundary position.
484  * @stable ICU 2.0
485  */

486 public boolean isBoundary(int offset) {
487     checkOffset(offset, fText);
488     
489     // the beginning index of the iterator is always a boundary position by definition
490
if (offset == fText.getBeginIndex()) {
491         first(); // For side effects on current position, tag values.
492
return true;
493     }
494
495     if (offset == fText.getEndIndex()) {
496         last(); // For side effects on current position, tag values.
497
return true;
498     }
499
500     // out-of-range indexes are never boundary positions
501
if (offset < fText.getBeginIndex()) {
502         first(); // For side effects on current position, tag values.
503
return false;
504     }
505
506     if (offset > fText.getEndIndex()) {
507         last(); // For side effects on current position, tag values.
508
return false;
509     }
510
511     // otherwise, we can use following() on the position before the specified
512
// one and return true if the position we get back is the one the user
513
// specified
514
return following(offset - 1) == offset;
515 }
516
517 /**
518  * Returns the current iteration position.
519  * @return The current iteration position.
520  * @stable ICU 2.0
521  */

522 public int current() {
523     return (fText != null) ? fText.getIndex() : BreakIterator.DONE;
524     }
525
526
527
528 private void makeRuleStatusValid() {
529     if (fLastStatusIndexValid == false) {
530         // No cached status is available.
531
if (fText == null || current() == fText.getBeginIndex()) {
532             // At start of text, or there is no text. Status is always zero.
533
fLastRuleStatusIndex = 0;
534             fLastStatusIndexValid = true;
535         } else {
536             // Not at start of text. Find status the tedious way.
537
int pa = current();
538             previous();
539             int pb = next();
540             if (ASSERT) Assert.assrt("pa == pb", pa == pb);
541         }
542     }
543     if (ASSERT) {
544         Assert.assrt("fLastStatusIndexValid == true", fLastStatusIndexValid == true);
545         Assert.assrt("fLastRuleStatusIndex >= 0 && fLastRuleStatusIndex < fRData.fStatusTable.length",
546                         fLastRuleStatusIndex >= 0 && fLastRuleStatusIndex < fRData.fStatusTable.length);
547     }
548 }
549
550
551 /**
552  * Return the status tag from the break rule that determined the most recently
553  * returned break position. The values appear in the rule source
554  * within brackets, {123}, for example. For rules that do not specify a
555  * status, a default value of 0 is returned. If more than one rule applies,
556  * the numerically largest of the possible status values is returned.
557  * <p>
558  * Of the standard types of ICU break iterators, only the word break
559  * iterator provides status values. The values are defined in
560  * class RuleBasedBreakIterator, and allow distinguishing between words
561  * that contain alphabetic letters, "words" that appear to be numbers,
562  * punctuation and spaces, words containing ideographic characters, and
563  * more. Call <code>getRuleStatus</code> after obtaining a boundary
564  * position from <code>next()<code>, <code>previous()</code>, or
565  * any other break iterator functions that returns a boundary position.
566  * <p>
567  * @return the status from the break rule that determined the most recently
568  * returned break position.
569  *
570  * @draft ICU 3.0
571  * @provisional This API might change or be removed in a future release.
572  */

573
574 public int getRuleStatus() {
575     makeRuleStatusValid();
576     // Status records have this form:
577
// Count N <-- fLastRuleStatusIndex points here.
578
// Status val 0
579
// Status val 1
580
// ...
581
// Status val N-1 <-- the value we need to return
582
// The status values are sorted in ascending order.
583
// This function returns the last (largest) of the array of status values.
584
int idx = fLastRuleStatusIndex + fRData.fStatusTable[fLastRuleStatusIndex];
585     int tagVal = fRData.fStatusTable[idx];
586
587     return tagVal;
588 }
589
590
591
592 /**
593  * Get the status (tag) values from the break rule(s) that determined the most
594  * recently returned break position. The values appear in the rule source
595  * within brackets, {123}, for example. The default status value for rules
596  * that do not explicitly provide one is zero.
597  * <p>
598  * The status values used by the standard ICU break rules are defined
599  * as public constants in class RuleBasedBreakIterator.
600  * <p>
601  * If the size of the output array is insufficient to hold the data,
602  * the output will be truncated to the available length. No exception
603  * will be thrown.
604  *
605  * @param fillInArray an array to be filled in with the status values.
606  * @return The number of rule status values from rules that determined
607  * the most recent boundary returned by the break iterator.
608  * In the event that the array is too small, the return value
609  * is the total number of status values that were available,
610  * not the reduced number that were actually returned.
611  * @draft ICU 3.0
612  * @provisional This API might change or be removed in a future release.
613  */

614 public int getRuleStatusVec(int[] fillInArray) {
615     makeRuleStatusValid();
616     int numStatusVals = fRData.fStatusTable[fLastRuleStatusIndex];
617     if (fillInArray != null) {
618         int numToCopy = Math.min(numStatusVals, fillInArray.length);
619         for (int i=0; i<numToCopy; i++) {
620             fillInArray[i] = fRData.fStatusTable[fLastRuleStatusIndex + i + 1];
621         }
622     }
623     return numStatusVals;
624  }
625
626
627 /**
628  * Return a CharacterIterator over the text being analyzed. This version
629  * of this method returns the actual CharacterIterator we're using internally.
630  * Changing the state of this iterator can have undefined consequences. If
631  * you need to change it, clone it first.
632  * @return An iterator over the text being analyzed.
633  * @stable ICU 2.0
634  */

635     public CharacterIterator JavaDoc getText() {
636         return fText;
637     }
638
639
640     /**
641      * Set the iterator to analyze a new piece of text. This function resets
642      * the current iteration position to the beginning of the text.
643      * @param newText An iterator over the text to analyze.
644      * @stable ICU 2.0
645      */

646     public void setText(CharacterIterator JavaDoc newText) {
647         fText = newText;
648         this.first();
649     }
650     
651     // 23 bit Char value returned from when an iterator has run out of range.
652
// Positive value so fast case (not end, not surrogate) can be checked
653
// with a single test.
654
private static int CI_DONE32 = 0x7fffffff;
655     
656     /**
657      * Move the iterator forward to the next code point, and return that code point,
658      * leaving the iterator positioned at char returned.
659      * For Supplementary chars, the iterator is left positioned at the lead surrogate.
660      * @param ci The character iterator
661      * @return The next code point.
662      */

663     private static int CINext32(CharacterIterator JavaDoc ci) {
664         // If the current position is at a surrogate pair, move to the trail surrogate
665
// which leaves it in positon for underlying iterator's next() to work.
666
int c= ci.current();
667         if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE && c<=UTF16.LEAD_SURROGATE_MAX_VALUE) {
668             c = ci.next();
669             if (c<UTF16.TRAIL_SURROGATE_MIN_VALUE || c>UTF16.TRAIL_SURROGATE_MAX_VALUE) {
670                c = ci.previous();
671             }
672         }
673
674         // For BMP chars, this next() is the real deal.
675
c = ci.next();
676         
677         // If we might have a lead surrogate, we need to peak ahead to get the trail
678
// even though we don't want to really be positioned there.
679
if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE) {
680             c = CINextTrail32(ci, c);
681         }
682         
683         if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && c != CI_DONE32) {
684             // We got a supplementary char. Back the iterator up to the postion
685
// of the lead surrogate.
686
ci.previous();
687         }
688         return c;
689    }
690
691     
692     // Out-of-line portion of the in-line Next32 code.
693
// The call site does an initial ci.next() and calls this function
694
// if the 16 bit value it gets is >= LEAD_SURROGATE_MIN_VALUE.
695
// NOTE: we leave the underlying char iterator positioned in the
696
// middle of a surroage pair. ci.next() will work correctly
697
// from there, but the ci.getIndex() will be wrong, and needs
698
// adjustment.
699
private static int CINextTrail32(CharacterIterator JavaDoc ci, int lead) {
700         int retVal = lead;
701         if (lead <= UTF16.LEAD_SURROGATE_MAX_VALUE) {
702             char cTrail = ci.next();
703             if (UTF16.isTrailSurrogate(cTrail)) {
704                 retVal = ((lead - UTF16.LEAD_SURROGATE_MIN_VALUE) << 10) +
705                             (cTrail - UTF16.TRAIL_SURROGATE_MIN_VALUE) +
706                             UTF16.SUPPLEMENTARY_MIN_VALUE;
707             } else {
708                 ci.previous();
709             }
710         } else {
711             if (lead == CharacterIterator.DONE && ci.getIndex() >= ci.getEndIndex()) {
712                 retVal = CI_DONE32;
713             }
714         }
715         return retVal;
716     }
717        
718     private static int CIPrevious32(CharacterIterator JavaDoc ci) {
719         if (ci.getIndex() <= ci.getBeginIndex()) {
720             return CI_DONE32;
721         }
722         char trail = ci.previous();
723         int retVal = trail;
724         if (UTF16.isTrailSurrogate(trail)) {
725             char lead = ci.previous();
726             if (UTF16.isLeadSurrogate(lead)) {
727                 retVal = (((int)lead - UTF16.LEAD_SURROGATE_MIN_VALUE) << 10) +
728                           ((int)trail - UTF16.TRAIL_SURROGATE_MIN_VALUE) +
729                           UTF16.SUPPLEMENTARY_MIN_VALUE;
730             } else {
731                 ci.next();
732             }
733         }
734         return retVal;
735     }
736     
737
738     
739     private static int CICurrent32(CharacterIterator JavaDoc ci) {
740         char lead = ci.current();
741         int retVal = lead;
742         if (retVal < UTF16.LEAD_SURROGATE_MIN_VALUE) {
743             return retVal;
744         }
745         if (UTF16.isLeadSurrogate(lead)) {
746             int trail = (int)ci.next();
747             ci.previous();
748             if (UTF16.isTrailSurrogate((char)trail)) {
749                 retVal = ((lead - UTF16.LEAD_SURROGATE_MIN_VALUE) << 10) +
750                          (trail - UTF16.TRAIL_SURROGATE_MIN_VALUE) +
751                          UTF16.SUPPLEMENTARY_MIN_VALUE;
752             }
753          } else {
754             if (lead == CharacterIterator.DONE) {
755                 if (ci.getIndex() >= ci.getEndIndex()) {
756                     retVal = CI_DONE32;
757                 }
758             }
759          }
760         return retVal;
761     }
762     
763
764     /**
765      * The State Machine Engine for moving forward is here.
766      * @param stateTable
767      * @return the new iterator position
768      *
769      * A note on supplementary characters and the position of underlying
770      * Java CharacterIterator: Normally, a character iterator is positioned at
771      * the char most recently returned by next(). Within this function, when
772      * a supplementary char is being processed, the char iterator is left
773      * sitting on the trail surrogate, in the middle of the code point.
774      * This is different from everywhere else, where an iterator always
775      * points at the lead surrogate of a supplementary.
776      */

777     private int handleNext(short stateTable[]) {
778         if (fTrace) {
779             System.out.println("Handle Next pos char state category");
780         }
781
782         // No matter what, handleNext alway correctly sets the break tag value.
783
fLastStatusIndexValid = true;
784
785         // if we're already at the end of the text, return DONE.
786
if (fText == null) {
787             fLastRuleStatusIndex = 0;
788             return BreakIterator.DONE;
789         }
790
791         int initialPosition = fText.getIndex();
792         int result = initialPosition;
793         int lookaheadResult = 0;
794
795         // Initialize the state machine. Begin in state 1
796
int state = START_STATE;
797         short category;
798         int c = fText.current();
799         if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE) {
800             c = CINextTrail32(fText, c);
801             if (c == CI_DONE32) {
802                 fLastRuleStatusIndex = 0;
803                 return BreakIterator.DONE;
804             }
805         }
806         int row = fRData.getRowIndex(state);
807         int lookaheadStatus = 0;
808         int lookaheadTagIdx = 0;
809
810         fLastRuleStatusIndex = 0;
811
812         // Character Category fetch for starting character.
813
// See comments on character category code within loop, below.
814
category = (short)fRData.fTrie.getCodePointValue(c);
815         //if ((category & 0x4000) != 0) {
816
// fDictionaryCharCount++;
817
// category &= ~0x4000;
818
// }
819

820         // loop until we reach the end of the text or transition to state 0
821
while (state != STOP_STATE) {
822             if (c == CI_DONE32) {
823                 // Reached end of input string.
824

825                 if (lookaheadResult > result) {
826                     // We ran off the end of the string with a pending look-ahead match.
827
// Treat this as if the look-ahead condition had been met, and return
828
// the match at the / position from the look-ahead rule.
829
result = lookaheadResult;
830                     fLastRuleStatusIndex = lookaheadTagIdx;
831                     lookaheadStatus = 0;
832                 } else if (result == initialPosition) {
833                     // Ran off end, no match found.
834
// move forward one
835
fText.setIndex(initialPosition);
836                     CINext32(fText);
837                 }
838                 break;
839             }
840             // look up the current character's character category, which tells us
841
// which column in the state table to look at.
842
//
843
category = (short)fRData.fTrie.getCodePointValue(c);
844
845             // Clear the dictionary flag bit in the character's category.
846
// Note: not using the old style dictionary stuff in this Java engine.
847
// But the bit can be set by the C++ rule compiler, and
848
// we need to clear it out here to be safe.
849
//category &= ~0x4000; // TODO: commented out for perf.
850

851             if (fTrace) {
852                 System.out.print(" " + RBBIDataWrapper.intToString(fText.getIndex(), 5));
853                 System.out.print(RBBIDataWrapper.intToHexString(c, 10));
854                 System.out.println(RBBIDataWrapper.intToString(state,7) + RBBIDataWrapper.intToString(category,6));
855             }
856
857             // look up a state transition in the state table
858
// state = row->fNextState[category];
859
state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category];
860             row = fRData.getRowIndex(state);
861
862             // Get the next character. Doing it here positions the iterator
863
// to the correct position for recording matches in the code that
864
// follows.
865
c = (int)fText.next();
866             if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE) {
867                 c = CINextTrail32(fText, c);
868             }
869
870             if (stateTable[row + RBBIDataWrapper.ACCEPTING] == -1) {
871                 // Match found, common case, could have lookahead so we move on to check it
872
result = fText.getIndex();
873                 if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && c != CI_DONE32) {
874                     // The iterator has been left in the middle of a surrogate pair.
875
// We want the start of it.
876
result--;
877                 }
878
879                 // Remember the break status (tag) values.
880
fLastRuleStatusIndex = stateTable[row + RBBIDataWrapper.TAGIDX];
881             }
882
883             if (stateTable[row + RBBIDataWrapper.LOOKAHEAD] != 0) {
884                 if (lookaheadStatus != 0
885                     && stateTable[row + RBBIDataWrapper.ACCEPTING] == lookaheadStatus) {
886                     // Lookahead match is completed. Set the result accordingly, but only
887
// if no other rule has matched further in the mean time.
888
result = lookaheadResult;
889                     fLastRuleStatusIndex = lookaheadTagIdx;
890                     lookaheadStatus = 0;
891                     continue;
892                 }
893
894                 lookaheadResult = fText.getIndex();
895                 if (c>=UTF16.SUPPLEMENTARY_MIN_VALUE && c!=CI_DONE32) {
896                     // The iterator has been left in the middle of a surrogate pair.
897
// We want the beginning of it.
898
lookaheadResult--;
899                 }
900                 lookaheadStatus = stateTable[row + RBBIDataWrapper.LOOKAHEAD];
901                 lookaheadTagIdx = stateTable[row + RBBIDataWrapper.TAGIDX];
902                 continue;
903             }
904
905
906             if (stateTable[row + RBBIDataWrapper.ACCEPTING] != 0) {
907                 lookaheadStatus = 0; // clear out any pending look-ahead matches.
908
}
909         } // End of state machine main loop
910

911         // The state machine is done. Check whether it found a match...
912

913         // If the iterator failed to advance in the match engine, force it ahead by one.
914
// (This really indicates a defect in the break rules. They should always match
915
// at least one character.)
916
if (result == initialPosition) {
917             result = fText.setIndex(initialPosition);
918             CINext32(fText);
919             result = fText.getIndex();
920         }
921
922         // Leave the iterator at our result position.
923
fText.setIndex(result);
924         if (fTrace) {
925             System.out.println("result = " + result);
926         }
927         return result;
928     }
929
930     /*
931      * handlePrevious
932      */

933     private int handlePrevious() {
934         if (fText == null || fRData == null) {
935             return 0;
936         }
937         if (fRData.fRTable == null) {
938             fText.first();
939             return fText.getIndex();
940         }
941
942         short stateTable[] = fRData.fRTable;
943         int state = START_STATE;
944         int category;
945         int lastCategory = 0;
946         int result = fText.getIndex();
947         int lookaheadStatus = 0;
948         int lookaheadResult = 0;
949         int lookaheadTagIdx = 0;
950         int c = CICurrent32(fText);
951         int row;
952
953         row = fRData.getRowIndex(state);
954         category = (short)fRData.fTrie.getCodePointValue(c);
955         //category &= ~0x4000; // Clear the dictionary bit, just in case.
956

957         if (fTrace) {
958             System.out.println("Handle Prev pos char state category ");
959         }
960
961         // loop until we reach the beginning of the text or transition to state 0
962
for (;;) {
963             if (c == CI_DONE32) {
964                 break;
965             }
966
967             // save the last character's category and look up the current
968
// character's category
969
lastCategory = category;
970             category = (short)fRData.fTrie.getCodePointValue(c);
971
972             // Check the dictionary bit in the character's category.
973
// Don't exist in this Java engine implementation. Clear the bit.
974
//
975
// category &= ~0x4000;
976

977             if (fTrace) {
978                 System.out.print(" " + fText.getIndex()+ " ");
979                 if (0x20<=c && c<0x7f) {
980                     System.out.print(" " + c + " ");
981                 } else {
982                     System.out.print(" " + Integer.toHexString(c) + " ");
983                 }
984                 System.out.println(" " + state + " " + category + " ");
985             }
986
987             // look up a state transition in the backwards state table
988
state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category];
989             row = fRData.getRowIndex(state);
990
991             continueOn: {
992                 if (stateTable[row + RBBIDataWrapper.ACCEPTING] == 0 &&
993                         stateTable[row + RBBIDataWrapper.LOOKAHEAD] == 0) {
994                     break continueOn;
995                 }
996                 
997                 if (stateTable[row + RBBIDataWrapper.ACCEPTING] == -1) {
998                     // Match found, common case, no lookahead involved.
999
result = fText.getIndex();
1000                    lookaheadStatus = 0; // clear out any pending look-ahead matches.
1001
break continueOn;
1002                }
1003                
1004                if (stateTable[row + RBBIDataWrapper.ACCEPTING] == 0 &&
1005                    stateTable[row + RBBIDataWrapper.LOOKAHEAD] != 0) {
1006                    // Lookahead match point. Remember it, but only if no other rule
1007
// has unconditionally matched to this point.
1008
// TODO: handle case where there's a pending match from a different rule
1009
// where lookaheadStatus != 0 && lookaheadStatus != row->fLookAhead.
1010
int r = fText.getIndex();
1011                    if (r > result) {
1012                        lookaheadResult = r;
1013                        lookaheadStatus = stateTable[row + RBBIDataWrapper.LOOKAHEAD];
1014                        lookaheadTagIdx = stateTable[row + RBBIDataWrapper.TAGIDX];
1015                    }
1016                    break continueOn;
1017                }
1018                
1019                if (stateTable[row + RBBIDataWrapper.ACCEPTING] != 0 &&
1020                        stateTable[row + RBBIDataWrapper.LOOKAHEAD] != 0) {
1021                    // Lookahead match is completed. Set the result accordingly, but only
1022
// if no other rule has matched further in the mean time.
1023
// TODO: CHECK THIS LOGIC. It looks backwards.
1024
// These are _reverse_ rules.
1025
if (lookaheadResult > result) {
1026                        if (stateTable[row + RBBIDataWrapper.ACCEPTING] != lookaheadStatus) {
1027                            // TODO: handle this case of overlapping lookahead matches.
1028
// With correctly written rules, we won't get here.
1029
// System.out.println("Trouble in handlePrevious()");
1030
}
1031                        result = lookaheadResult;
1032                        fLastRuleStatusIndex = lookaheadTagIdx;
1033                        lookaheadStatus = 0;
1034                    }
1035                    break continueOn;
1036                }
1037            } // end of continueOn block.
1038

1039            if (state == STOP_STATE) {
1040                break;
1041            }
1042
1043            // then move one character backwards
1044
c = CIPrevious32(fText);
1045       }
1046
1047        // Note: the result position isn't what is returned to the user by previous(),
1048
// but where the implementation of previous() turns around and
1049
// starts iterating forward again.
1050
if (c == CI_DONE32) {
1051            result = fText.getBeginIndex();
1052        }
1053        fText.setIndex(result);
1054
1055        return result;
1056    }
1057    
1058    
1059    private int handlePrevious(short stateTable[]) {
1060        if (fText == null || stateTable == null) {
1061            return 0;
1062        }
1063        // break tag is no longer valid after icu switched to exact backwards
1064
// positioning.
1065
fLastStatusIndexValid = false;
1066        if (stateTable == null) {
1067            return fText.getBeginIndex();
1068        }
1069
1070        int state = START_STATE;
1071        int category;
1072        int c = CIPrevious32(fText);
1073        // previous character
1074
int result = fText.getIndex();
1075        int lookaheadStatus = 0;
1076        int lookaheadResult = 0;
1077        boolean lookAheadHardBreak =
1078            (stateTable[RBBIDataWrapper.FLAGS+1] & RBBIDataWrapper.RBBI_LOOKAHEAD_HARD_BREAK) != 0;
1079  
1080        int row = fRData.getRowIndex(state);
1081
1082        category = (short)fRData.fTrie.getCodePointValue(c);
1083
1084        if (fTrace) {
1085            System.out.println("Handle Prev pos char state category ");
1086        }
1087        
1088        // loop until we reach the beginning of the text or transition to state 0
1089
for (;;) {
1090            if (c==CI_DONE32) {
1091                if (fRData.fHeader.fVersion == 1) {
1092                    // This is the old (ICU 3.2 and earlier) format data.
1093
// No explicit support for matching {eof}. Did have hacke, though...
1094
if (stateTable[row + RBBIDataWrapper.LOOKAHEAD] != 0 &&
1095                            lookaheadResult == 0) {
1096                        result = 0;
1097                    }
1098                    break;
1099                }
1100                // Newer data format, with support for {eof}.
1101
// end of input is hardwired by rule builder as category 1
1102
category = 1;
1103            } else {
1104                // not at {eof}
1105
// look up the current character's category (the table column)
1106
category = (short)fRData.fTrie.getCodePointValue(c);
1107            }
1108
1109            // category &= ~0x4000; // Clear the dictionary bit flag
1110
// // (Should be unused; holdover from old RBBI)
1111

1112            if (fTrace) {
1113                System.out.print(" " + fText.getIndex()+ " ");
1114                if (0x20<=c && c<0x7f) {
1115                    System.out.print(" " + c + " ");
1116                } else {
1117                    System.out.print(" " + Integer.toHexString(c) + " ");
1118                }
1119                System.out.println(" " + state + " " + category + " ");
1120            }
1121
1122            // look up a state transition in the backwards state table
1123
state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category];
1124            row = fRData.getRowIndex(state);
1125
1126            if (stateTable[row + RBBIDataWrapper.ACCEPTING] == -1) {
1127                // Match found, common case, could have lookahead so we move on to check it
1128
result = fText.getIndex();
1129            }
1130
1131            if (stateTable[row + RBBIDataWrapper.LOOKAHEAD] != 0) {
1132                if (lookaheadStatus != 0
1133                    && stateTable[row + RBBIDataWrapper.ACCEPTING] == lookaheadStatus) {
1134                    // Lookahead match is completed. Set the result accordingly, but only
1135
// if no other rule has matched further in the mean time.
1136
result = lookaheadResult;
1137                    lookaheadStatus = 0;
1138                    /// i think we have to back up to read the lookahead character again
1139
/// fText->setIndex(lookaheadResult);
1140
/// TODO: this is a simple hack since reverse rules only have simple
1141
/// lookahead rules that we can definitely break out from.
1142
/// we need to make the lookahead rules not chain eventually.
1143
/// return result;
1144
/// this is going to be the longest match again
1145

1146                    /// syn wee todo hard coded for line breaks stuff
1147
/// needs to provide a tag in rules to ensure a stop.
1148

1149                    if (lookAheadHardBreak) {
1150                        break;
1151                    }
1152                    fText.setIndex(result);
1153                } else {
1154                    // Hit a possible look-ahead match. We are at the
1155
// position of the '/'. Remember this position.
1156
lookaheadResult = fText.getIndex();
1157                    lookaheadStatus = stateTable[row + RBBIDataWrapper.LOOKAHEAD];
1158                }
1159            } else {
1160                // not lookahead...
1161
if (stateTable[row + RBBIDataWrapper.ACCEPTING] != 0) {
1162                    // This is a plain (non-look-ahead) acceptiong state.
1163
if (!lookAheadHardBreak) {
1164                        lookaheadStatus = 0; // Clear out any pending look-ahead matches,
1165
// but only if not doing the lookAheadHardBreak option
1166
// which needs to force a break no matter what is going
1167
// on with the rest of the match, i.e. we can't abandon
1168
// a partially completed look-ahead match because some
1169
// other rule matched further than the '/' position
1170
// in the look-ahead match.
1171
}
1172                }
1173            }
1174            
1175            if (state == STOP_STATE) {
1176                break;
1177            }
1178
1179            // then move iterator position backwards one character
1180
c = CIPrevious32(fText);
1181        }
1182
1183        fText.setIndex(result);
1184
1185        return result;
1186    }
1187
1188}
1189
1190
1191
1192
1193
1194
Popular Tags