package com.ibm.icu.text;

import java.text.CharacterIterator;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;

import com.ibm.icu.impl.Assert;


/**
 * A {@link BreakIterator} that locates boundaries by running the state tables
 * held in an {@link RBBIDataWrapper} over the characters of a
 * {@link CharacterIterator}.
 *
 * <p>The rule data either comes from pre-compiled binary rules
 * ({@link #getInstanceFromCompiledRules(InputStream)}) or is compiled from
 * rule source text ({@link #RuleBasedBreakIterator(String)}).</p>
 */
public class RuleBasedBreakIterator extends BreakIterator {

    //=======================================================================
    // Constructors and factories
    //=======================================================================

    /**
     * Creates an iterator with no rule data. {@link #fRData} stays
     * {@code null} until it is assigned, and the text is an empty string.
     */
    public RuleBasedBreakIterator() {
    }

    /**
     * Creates an iterator whose rule data is read from a stream of
     * pre-compiled rules.
     *
     * @param is stream handed to {@code RBBIDataWrapper.get}
     * @return a new iterator using the loaded rule data
     * @throws IOException if reading the rule data fails
     */
    public static RuleBasedBreakIterator getInstanceFromCompiledRules(InputStream is) throws IOException {
        RuleBasedBreakIterator This = new RuleBasedBreakIterator();
        This.fRData = RBBIDataWrapper.get(is);
        return This;
    }

    /**
     * Copy constructor used by {@link #clone()}. The rule data is shared with
     * {@code other}; the text iterator is cloned so the two iterators can be
     * positioned independently.
     *
     * <p>The cached rule status is intentionally not copied: it is left in
     * its default "not valid" state and is recomputed on demand by
     * {@link #makeRuleStatusValid()}.</p>
     */
    private RuleBasedBreakIterator(RuleBasedBreakIterator other) {
        fRData = other.fRData;
        // The guard must look at other.fText. This object's own fText has
        // already been set to a non-null empty iterator by its field
        // initializer, so testing it would always succeed and then fail with a
        // NullPointerException whenever the source iterator has no text.
        if (other.fText != null) {
            fText = (CharacterIterator) (other.fText.clone());
        } else {
            fText = null;
        }
    }

    /**
     * Creates an iterator from rule source text. The rules are compiled into
     * an in-memory byte array which is then loaded exactly like pre-compiled
     * rules.
     *
     * @param rules the rule source text
     * @throws RuntimeException if loading the freshly compiled rules raises an
     *         {@link IOException}; the original exception is attached as the
     *         cause
     */
    public RuleBasedBreakIterator(String rules) {
        init();
        try {
            ByteArrayOutputStream ruleOS = new ByteArrayOutputStream();
            compileRules(rules, ruleOS);
            byte[] ruleBA = ruleOS.toByteArray();
            InputStream ruleIS = new ByteArrayInputStream(ruleBA);
            fRData = RBBIDataWrapper.get(ruleIS);
        } catch (IOException e) {
            // All I/O here is against in-memory streams; keep the cause so the
            // underlying failure is not lost when this is reported.
            throw new RuntimeException("RuleBasedBreakIterator rule compilation internal error:", e);
        }
    }


    //=======================================================================
    // Object overrides
    //=======================================================================

    /**
     * Returns a copy of this iterator that shares the rule data and has its
     * own clone of the text iterator.
     *
     * @return the new iterator
     */
    public Object clone()
    {
        RuleBasedBreakIterator result = new RuleBasedBreakIterator(this);
        return result;
    }

    /**
     * Compares this iterator with another object. Two iterators are equal
     * when both have no rule data or both have rule data with equal rule
     * source, and both have no text or have equal text iterators.
     *
     * @param that the object to compare with; may be {@code null}
     * @return {@code true} if {@code that} is an equal
     *         {@code RuleBasedBreakIterator}, otherwise {@code false}
     */
    public boolean equals(Object that) {
        // instanceof also rejects null, which a cast would let through.
        if (!(that instanceof RuleBasedBreakIterator)) {
            return false;
        }
        RuleBasedBreakIterator other = (RuleBasedBreakIterator) that;

        if (fRData != other.fRData && (fRData == null || other.fRData == null)) {
            return false;
        }
        if (fRData != null && other.fRData != null &&
                (!fRData.fRuleSource.equals(other.fRData.fRuleSource))) {
            return false;
        }
        if (fText == null && other.fText == null) {
            return true;
        }
        if (fText == null || other.fText == null) {
            return false;
        }
        return fText.equals(other.fText);
    }

    /**
     * Returns the rule source text of this iterator.
     *
     * @return the rule source, or {@code null} when there is no rule data
     */
    public String toString() {
        String retStr = null;
        if (fRData != null) {
            retStr = fRData.fRuleSource;
        }
        return retStr;
    }

    /**
     * Returns a hash code derived from the rule source text.
     *
     * @return the hash of the rule source, or 0 when there is no rule data
     */
    public int hashCode()
    {
        // An iterator made by the no-arg constructor has no rule data yet;
        // equals() copes with that state, so hashCode() must as well.
        if (fRData == null) {
            return 0;
        }
        return fRData.fRuleSource.hashCode();
    }


    //=======================================================================
    // Status tag constants
    //
    // NOTE(review): each X / X_LIMIT pair looks like the bounds of a
    // half-open range [X, X_LIMIT) of rule status values as returned by
    // getRuleStatus(); nothing in this file reads them - confirm against
    // the rule definitions.
    //=======================================================================

    /** Lower bound of the WORD_NONE tag range. */
    public static final int WORD_NONE = 0;

    /** Upper bound of the WORD_NONE tag range. */
    public static final int WORD_NONE_LIMIT = 100;

    /** Lower bound of the WORD_NUMBER tag range. */
    public static final int WORD_NUMBER = 100;

    /** Upper bound of the WORD_NUMBER tag range. */
    public static final int WORD_NUMBER_LIMIT = 200;

    /** Lower bound of the WORD_LETTER tag range. */
    public static final int WORD_LETTER = 200;

    /** Upper bound of the WORD_LETTER tag range. */
    public static final int WORD_LETTER_LIMIT = 300;

    /** Lower bound of the WORD_KANA tag range. */
    public static final int WORD_KANA = 300;

    /** Upper bound of the WORD_KANA tag range. */
    public static final int WORD_KANA_LIMIT = 400;

    /** Lower bound of the WORD_IDEO tag range. */
    public static final int WORD_IDEO = 400;

    /** Upper bound of the WORD_IDEO tag range. */
    public static final int WORD_IDEO_LIMIT = 500;


    //=======================================================================
    // State machine constants and instance state
    //=======================================================================

    /** State in which every run of the state machine begins. */
    private static final int START_STATE = 1;

    /** State that terminates a run of the state machine. */
    private static final int STOP_STATE = 0;

    /** Mode: the one-time start-of-input category still has to be fed. */
    private static final int RBBI_START = 0;

    /** Mode: categories are looked up from the characters of the text. */
    private static final int RBBI_RUN = 1;

    /** Mode: the one-time end-of-input category has been fed. */
    private static final int RBBI_END = 2;

    /** The text being iterated over; may be set to {@code null} via {@link #setText}. */
    private CharacterIterator fText = new java.text.StringCharacterIterator("");

    /** The rule data (state tables, trie, status table, rule source). */
    protected RBBIDataWrapper fRData;

    /** Index into {@code fRData.fStatusTable} of the status group for the latest boundary. */
    private int fLastRuleStatusIndex;

    /**
     * Whether {@link #fLastRuleStatusIndex} is current. When {@code false},
     * {@link #makeRuleStatusValid()} recomputes it before it is used.
     */
    private boolean fLastStatusIndexValid;

    /**
     * Running count of characters seen by the state machine whose category
     * value had the 0x4000 (dictionary) bit set.
     */
    protected int fDictionaryCharCount;

    /** When {@code true}, the state machine prints a trace to {@code System.out}. */
    public static boolean fTrace;


    /**
     * Dumps the rule data by delegating to {@code fRData.dump()}.
     */
    public void dump() {
        this.fRData.dump();
    }

    /** Set once the U_RBBIDEBUG system property has been examined by {@link #init()}. */
    private static boolean debugInitDone = false;

    /**
     * Resets per-instance bookkeeping and, the first time it runs, enables
     * {@link #fTrace} if the {@code U_RBBIDEBUG} system property contains
     * {@code "trace"}.
     */
    private void init() {
        fLastStatusIndexValid = true;
        fDictionaryCharCount = 0;

        if (debugInitDone == false) {
            String debugEnv = System.getProperty("U_RBBIDEBUG");
            if (debugEnv != null && debugEnv.indexOf("trace") >= 0) {
                fTrace = true;
            }
            debugInitDone = true;
        }
    }

    /**
     * Compiles rule source text to its binary form by delegating to
     * {@code RBBIRuleBuilder.compileRules}.
     *
     * @param rules      the rule source text
     * @param ruleBinary stream that receives the compiled rules
     * @throws IOException if writing the compiled rules fails
     */
    private static void compileRules(String rules, OutputStream ruleBinary) throws IOException {
        RBBIRuleBuilder.compileRules(rules, ruleBinary);
    }


    //=======================================================================
    // BreakIterator overrides
    //=======================================================================

    /**
     * Moves to the start of the text.
     *
     * @return the start index of the text, or {@link BreakIterator#DONE} if
     *         there is no text
     */
    public int first() {
        fLastRuleStatusIndex = 0;
        fLastStatusIndexValid = true;
        if (fText == null) {
            return BreakIterator.DONE;
        }
        fText.first();
        return fText.getIndex();
    }


    /**
     * Moves to the end of the text. The cached rule status is marked as not
     * valid so that it is recomputed if it is asked for.
     *
     * @return the end index of the text, or {@link BreakIterator#DONE} if
     *         there is no text
     */
    public int last() {
        if (fText == null) {
            fLastRuleStatusIndex = 0;
            fLastStatusIndexValid = true;
            return BreakIterator.DONE;
        }

        fLastStatusIndexValid = false;
        int pos = fText.getEndIndex();
        fText.setIndex(pos);
        return pos;
    }


    /**
     * Moves {@code n} boundaries forward (positive {@code n}) or backward
     * (negative {@code n}).
     *
     * @param n number of boundaries to move; 0 leaves the position unchanged
     * @return the result of the last move, or {@link #current()} if
     *         {@code n} is 0
     */
    public int next(int n) {
        int result = current();
        while (n > 0) {
            result = handleNext();
            --n;
        }
        while (n < 0) {
            result = previous();
            ++n;
        }
        return result;
    }


    /**
     * Moves to the next boundary.
     *
     * @return the position of the next boundary as reported by
     *         {@link #handleNext()}
     */
    public int next() {
        return handleNext();
    }


    /**
     * Moves to the boundary preceding the current position.
     *
     * <p>If the rule data has a safe-reverse or safe-forward table, the
     * reverse table is trusted to give the exact answer. Otherwise the
     * reverse table is only used to back up to some earlier boundary, and the
     * forward rules are then run from there to find the last boundary before
     * the starting position.</p>
     *
     * @return the previous boundary, or {@link BreakIterator#DONE} if there
     *         is no text or the iterator is already at the start
     */
    public int previous() {
        if (fText == null || current() == fText.getBeginIndex()) {
            fLastRuleStatusIndex = 0;
            fLastStatusIndexValid = true;
            return BreakIterator.DONE;
        }

        if (fRData.fSRTable != null || fRData.fSFTable != null) {
            return handlePrevious(fRData.fRTable);
        }

        int start = current();

        CIPrevious32(fText);
        int lastResult = handlePrevious(fRData.fRTable);
        if (lastResult == BreakIterator.DONE) {
            lastResult = fText.getBeginIndex();
            fText.setIndex(lastResult);
        }
        int result = lastResult;
        int lastTag = 0;
        boolean breakTagValid = false;

        // Walk forward from the backed-up position, remembering the last
        // boundary (and its status index) that lies before the start.
        for (;;) {
            result = handleNext();
            if (result == BreakIterator.DONE || result >= start) {
                break;
            }
            lastResult = result;
            lastTag = fLastRuleStatusIndex;
            breakTagValid = true;
        }

        fText.setIndex(lastResult);
        fLastRuleStatusIndex = lastTag;
        // The status is only known if the forward walk actually produced the
        // boundary being returned.
        fLastStatusIndexValid = breakTagValid;
        return lastResult;
    }

    /**
     * Moves to the first boundary after the given offset.
     *
     * @param offset the offset to start from
     * @return the first boundary greater than {@code offset}; if
     *         {@code offset} is at or beyond the end of the text (or there is
     *         no text) the result of {@code next()} from the end; if
     *         {@code offset} is before the start, the result of
     *         {@link #first()}
     */
    public int following(int offset) {
        fLastRuleStatusIndex = 0;
        fLastStatusIndexValid = true;
        if (fText == null || offset >= fText.getEndIndex()) {
            last();
            return next();
        }
        else if (offset < fText.getBeginIndex()) {
            return first();
        }

        int result = 0;

        if (fRData.fSRTable != null) {
            // Safe-reverse table available: step one code point past the
            // offset, back up with the safe-reverse table, then go forward.
            fText.setIndex(offset);
            CINext32(fText);
            handlePrevious(fRData.fSRTable);
            result = next();
            while (result <= offset) {
                result = next();
            }
            return result;
        }
        if (fRData.fSFTable != null) {
            // Safe-forward table available: step one code point before the
            // offset, move ahead with the safe-forward table, then come back.
            fText.setIndex(offset);
            CIPrevious32(fText);
            handleNext(fRData.fSFTable);
            int oldresult = previous();
            while (oldresult > offset) {
                result = previous();
                if (result <= offset) {
                    return oldresult;
                }
                oldresult = result;
            }
            result = next();
            if (result <= offset) {
                return next();
            }
            return result;
        }

        // No safe tables: back up to a boundary with previous() and then
        // advance until a boundary beyond the offset is found.
        fText.setIndex(offset);
        if (offset == fText.getBeginIndex()) {
            return handleNext();
        }
        result = previous();

        while (result != BreakIterator.DONE && result <= offset) {
            result = next();
        }

        return result;
    }

    /**
     * Moves to the last boundary before the given offset.
     *
     * @param offset the offset to start from
     * @return the last boundary less than {@code offset}; if {@code offset}
     *         is beyond the end of the text (or there is no text) the result
     *         of {@link #last()}; if {@code offset} is before the start, the
     *         result of {@link #first()}
     */
    public int preceding(int offset) {
        if (fText == null || offset > fText.getEndIndex()) {
            return last();
        }
        else if (offset < fText.getBeginIndex()) {
            return first();
        }

        int result;
        if (fRData.fSFTable != null) {
            // Safe-forward table available: step back one code point, move
            // ahead with the safe-forward table, then go backward.
            fText.setIndex(offset);
            CIPrevious32(fText);
            handleNext(fRData.fSFTable);
            result = previous();
            while (result >= offset) {
                result = previous();
            }
            return result;
        }
        if (fRData.fSRTable != null) {
            // Safe-reverse table available: step forward one code point, back
            // up with the safe-reverse table, then walk forward.
            fText.setIndex(offset);
            CINext32(fText);
            handlePrevious(fRData.fSRTable);

            int oldresult = next();
            while (oldresult < offset) {
                result = next();
                if (result >= offset) {
                    return oldresult;
                }
                oldresult = result;
            }
            result = previous();
            if (result >= offset) {
                return previous();
            }
            return result;
        }

        fText.setIndex(offset);
        return previous();
    }

    /**
     * Verifies that an offset lies within the bounds of a text iterator.
     *
     * @param offset the offset to check
     * @param text   the text whose begin and end index bound the offset
     * @throws IllegalArgumentException if {@code offset} is before the begin
     *         index or after the end index of {@code text}
     */
    protected static final void checkOffset(int offset, CharacterIterator text) {
        if (offset < text.getBeginIndex() || offset > text.getEndIndex()) {
            throw new IllegalArgumentException("offset out of bounds");
        }
    }


    /**
     * Tells whether the given offset is a boundary. The start and the end of
     * the text always are; the iterator is moved there as a side effect. For
     * any other offset the iterator ends up wherever {@link #following(int)}
     * leaves it.
     *
     * @param offset the offset to test
     * @return {@code true} if {@code offset} is a boundary
     * @throws IllegalArgumentException if {@code offset} is outside the text
     */
    public boolean isBoundary(int offset) {
        checkOffset(offset, fText);

        if (offset == fText.getBeginIndex()) {
            first();
            return true;
        }

        if (offset == fText.getEndIndex()) {
            last();
            return true;
        }

        // Step back one code point and ask whether the boundary that follows
        // that position is exactly the offset.
        fText.setIndex(offset);
        CIPrevious32(fText);
        int pos = fText.getIndex();
        boolean result = following(pos) == offset;
        return result;
    }


    /**
     * Returns the current position of the iterator.
     *
     * @return the index of the text iterator, or {@link BreakIterator#DONE}
     *         if there is no text
     */
    public int current() {
        return (fText != null) ? fText.getIndex() : BreakIterator.DONE;
    }


    /**
     * Ensures {@link #fLastRuleStatusIndex} describes the boundary at the
     * current position. If it is stale, it is recomputed by stepping to the
     * previous boundary and forward again, which must land on the same
     * position.
     */
    private void makeRuleStatusValid() {
        if (fLastStatusIndexValid == false) {
            if (fText == null || current() == fText.getBeginIndex()) {
                fLastRuleStatusIndex = 0;
                fLastStatusIndexValid = true;
            } else {
                int pa = current();
                previous();
                int pb = next();
                Assert.assrt(pa == pb);
            }
            Assert.assrt(fLastStatusIndexValid == true);
            Assert.assrt(fLastRuleStatusIndex >= 0 && fLastRuleStatusIndex < fRData.fStatusTable.length);
        }
    }


    /**
     * Returns the status value for the boundary at the current position.
     *
     * <p>A status group in {@code fRData.fStatusTable} starts with a count
     * followed by that many values; this method returns the last value of
     * the group.</p>
     *
     * @return the last status value of the current status group
     */
    public int getRuleStatus() {
        makeRuleStatusValid();
        // The entry at fLastRuleStatusIndex is the count of values in the
        // group, so adding it to the index addresses the group's last value.
        int idx = fLastRuleStatusIndex + fRData.fStatusTable[fLastRuleStatusIndex];
        int tagVal = fRData.fStatusTable[idx];

        return tagVal;
    }


    /**
     * Copies the status values for the boundary at the current position into
     * the supplied array.
     *
     * @param fillInArray receives as many status values as fit; may be
     *        {@code null}, in which case nothing is copied
     * @return the total number of status values available, which may exceed
     *         the number copied
     */
    public int getRuleStatusVec(int[] fillInArray) {
        makeRuleStatusValid();
        int numStatusVals = fRData.fStatusTable[fLastRuleStatusIndex];
        if (fillInArray != null) {
            int numToCopy = Math.min(numStatusVals, fillInArray.length);
            for (int i = 0; i < numToCopy; i++) {
                fillInArray[i] = fRData.fStatusTable[fLastRuleStatusIndex + i + 1];
            }
        }
        return numStatusVals;
    }


    /**
     * Returns the text iterator being used. This is the live iterator, not a
     * copy.
     *
     * @return the text iterator, possibly {@code null}
     */
    public CharacterIterator getText() {
        return fText;
    }


    /**
     * Sets the text to iterate over and moves to its start.
     *
     * @param newText the new text; the iterator is used directly, not copied
     */
    public void setText(CharacterIterator newText) {
        fText = newText;
        this.first();
    }

    /** Value of the {@code U_RBBIDEBUG} system property, read at class initialization. */
    protected static String fDebugEnv = System.getProperty("U_RBBIDEBUG");


    //=======================================================================
    // Code point helpers for CharacterIterator
    //=======================================================================

    /**
     * Sentinel returned by the 32-bit helpers when the iterator has run off
     * the text. It is distinct from {@code CharacterIterator.DONE}, which is
     * also a legal char value.
     */
    private static final int CI_DONE32 = 0x7fffffff;

    /**
     * Advances the iterator by one code point and returns the code point at
     * the new position. On return the iterator index is on the first code
     * unit of that code point.
     *
     * @param ci the iterator to advance
     * @return the code point at the new position, or {@link #CI_DONE32} at
     *         the end of the text
     */
    static int CINext32(CharacterIterator ci) {
        // If the current position is the lead of a surrogate pair, move onto
        // its trail so that the next() below leaves the pair entirely.
        int c = ci.current();
        if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE && c <= UTF16.LEAD_SURROGATE_MAX_VALUE) {
            c = ci.next();
            if (c < UTF16.TRAIL_SURROGATE_MIN_VALUE || c > UTF16.TRAIL_SURROGATE_MAX_VALUE) {
                c = ci.previous();
            }
        }

        c = ci.next();

        if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE) {
            c = CINextTrail32(ci, c);
        }

        if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && c != CI_DONE32) {
            // CINextTrail32 left the index on the trail; move back to the lead.
            ci.previous();
        }
        return c;
    }


    /**
     * Completes the decoding of a code unit that is at or above the lead
     * surrogate range. If {@code lead} is a lead surrogate and is followed by
     * a trail surrogate, the iterator is left on the trail and the combined
     * supplementary code point is returned.
     *
     * @param ci   the iterator, positioned on {@code lead}
     * @param lead the code unit at the current position
     * @return the supplementary code point, {@link #CI_DONE32} if
     *         {@code lead} is {@code CharacterIterator.DONE} at the end of
     *         the text, otherwise {@code lead} unchanged
     */
    private static int CINextTrail32(CharacterIterator ci, int lead) {
        int retVal = lead;
        if (lead <= UTF16.LEAD_SURROGATE_MAX_VALUE) {
            char cTrail = ci.next();
            if (UTF16.isTrailSurrogate(cTrail)) {
                retVal = ((lead - UTF16.LEAD_SURROGATE_MIN_VALUE) << 10) +
                         (cTrail - UTF16.TRAIL_SURROGATE_MIN_VALUE) +
                         UTF16.SUPPLEMENTARY_MIN_VALUE;
            } else {
                ci.previous();
            }
        } else {
            if (lead == CharacterIterator.DONE && ci.getIndex() >= ci.getEndIndex()) {
                retVal = CI_DONE32;
            }
        }
        return retVal;
    }

    /**
     * Moves the iterator back by one code point and returns that code point.
     *
     * @param ci the iterator to move
     * @return the code point now at the iterator position, or
     *         {@link #CI_DONE32} if the iterator was already at the start
     */
    private static int CIPrevious32(CharacterIterator ci) {
        if (ci.getIndex() <= ci.getBeginIndex()) {
            return CI_DONE32;
        }
        char trail = ci.previous();
        int retVal = trail;
        if (UTF16.isTrailSurrogate(trail) && ci.getIndex() > ci.getBeginIndex()) {
            char lead = ci.previous();
            if (UTF16.isLeadSurrogate(lead)) {
                retVal = (((int) lead - UTF16.LEAD_SURROGATE_MIN_VALUE) << 10) +
                         ((int) trail - UTF16.TRAIL_SURROGATE_MIN_VALUE) +
                         UTF16.SUPPLEMENTARY_MIN_VALUE;
            } else {
                ci.next();
            }
        }
        return retVal;
    }

    /**
     * Returns the code point at the current iterator position without moving
     * the iterator.
     *
     * @param ci the iterator to read from
     * @return the code point at the current position, or {@link #CI_DONE32}
     *         at the end of the text
     */
    static int CICurrent32(CharacterIterator ci) {
        char lead = ci.current();
        int retVal = lead;
        if (retVal < UTF16.LEAD_SURROGATE_MIN_VALUE) {
            return retVal;
        }
        if (UTF16.isLeadSurrogate(lead)) {
            // Peek at the following code unit, then restore the position.
            int trail = (int) ci.next();
            ci.previous();
            if (UTF16.isTrailSurrogate((char) trail)) {
                retVal = ((lead - UTF16.LEAD_SURROGATE_MIN_VALUE) << 10) +
                         (trail - UTF16.TRAIL_SURROGATE_MIN_VALUE) +
                         UTF16.SUPPLEMENTARY_MIN_VALUE;
            }
        } else {
            if (lead == CharacterIterator.DONE) {
                if (ci.getIndex() >= ci.getEndIndex()) {
                    retVal = CI_DONE32;
                }
            }
        }
        return retVal;
    }


    //=======================================================================
    // State machine
    //=======================================================================

    /**
     * Finds the next boundary using the forward state table of the rule data.
     *
     * @return the position of the next boundary, or
     *         {@link BreakIterator#DONE}
     */
    int handleNext() {
        return handleNext(fRData.fFTable);
    }

    /**
     * Runs the given state table forward from the current position until it
     * reaches the stop state or the end of the text, and moves the iterator
     * to the boundary that was found.
     *
     * <p>Also updates {@link #fLastRuleStatusIndex} and counts dictionary
     * characters in {@link #fDictionaryCharCount}.</p>
     *
     * @param stateTable the state table to run
     * @return the position of the boundary, or {@link BreakIterator#DONE} if
     *         there is no text or the iterator is already at the end
     */
    private int handleNext(short[] stateTable) {
        int state;
        short category = 0;
        int mode;
        int row;
        int c;
        int lookaheadStatus = 0;
        int lookaheadTagIdx = 0;
        int result = 0;
        int initialPosition = 0;
        int lookaheadResult = 0;
        boolean lookAheadHardBreak =
            (stateTable[RBBIDataWrapper.FLAGS+1] & RBBIDataWrapper.RBBI_LOOKAHEAD_HARD_BREAK) != 0;

        if (fTrace) {
            System.out.println("Handle Next pos char state category");
        }

        // No matter what, the status computed here will be the current one.
        fLastStatusIndexValid = true;
        fLastRuleStatusIndex = 0;

        if (fText == null) {
            fLastRuleStatusIndex = 0;
            return BreakIterator.DONE;
        }

        initialPosition = fText.getIndex();
        result = initialPosition;
        c = fText.current();
        if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE) {
            c = CINextTrail32(fText, c);
            if (c == CI_DONE32) {
                fLastRuleStatusIndex = 0;
                return BreakIterator.DONE;
            }
        }

        state = START_STATE;
        row = fRData.getRowIndex(state);
        category = 3;
        mode = RBBI_RUN;
        if ((stateTable[RBBIDataWrapper.FLAGS+1] & RBBIDataWrapper.RBBI_BOF_REQUIRED) != 0) {
            // The table wants a start-of-input category (2) fed before any
            // real character.
            category = 2;
            mode = RBBI_START;
        }

        while (state != STOP_STATE) {
            if (c == CI_DONE32) {
                // Reached the end of the text.
                if (mode == RBBI_END) {
                    // The end-of-input category has already been fed once.
                    if (lookaheadResult > result) {
                        // A pending look-ahead match lies beyond the best
                        // ordinary match; use it.
                        result = lookaheadResult;
                        fLastRuleStatusIndex = lookaheadTagIdx;
                        lookaheadStatus = 0;
                    } else if (result == initialPosition) {
                        // Nothing matched: force progress by one code point.
                        fText.setIndex(initialPosition);
                        CINext32(fText);
                    }
                    break;
                }
                // Feed the end-of-input category (1) exactly once.
                mode = RBBI_END;
                category = 1;
            }

            if (mode == RBBI_RUN) {
                category = (short) fRData.fTrie.getCodePointValue(c);

                if ((category & 0x4000) != 0) {
                    // Bit 0x4000 flags a dictionary character; count it and
                    // strip the flag to get the table column.
                    fDictionaryCharCount++;
                    category &= ~0x4000;
                }
            }

            if (fTrace) {
                System.out.print(" " + RBBIDataWrapper.intToString(fText.getIndex(), 5));
                System.out.print(RBBIDataWrapper.intToHexString(c, 10));
                System.out.println(RBBIDataWrapper.intToString(state,7) + RBBIDataWrapper.intToString(category,6));
            }

            // Transition to the next state.
            state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category];
            row = fRData.getRowIndex(state);

            if (mode == RBBI_RUN) {
                // Fetch the next character now so the index is already past
                // the character just consumed when positions are recorded.
                c = (int) fText.next();
                if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE) {
                    c = CINextTrail32(fText, c);
                }
            } else {
                if (mode == RBBI_START) {
                    mode = RBBI_RUN;
                }
            }

            if (stateTable[row + RBBIDataWrapper.ACCEPTING] == -1) {
                // Ordinary accepting state: remember the match position.
                result = fText.getIndex();
                if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && c != CI_DONE32) {
                    // The index is on the trail surrogate of the next
                    // character; the boundary is before its lead.
                    result--;
                }

                fLastRuleStatusIndex = stateTable[row + RBBIDataWrapper.TAGIDX];
            }

            if (stateTable[row + RBBIDataWrapper.LOOKAHEAD] != 0) {
                if (lookaheadStatus != 0
                        && stateTable[row + RBBIDataWrapper.ACCEPTING] == lookaheadStatus) {
                    // End of a look-ahead match: the boundary is the position
                    // saved when the look-ahead began.
                    result = lookaheadResult;
                    fLastRuleStatusIndex = lookaheadTagIdx;
                    lookaheadStatus = 0;
                    if (lookAheadHardBreak) {
                        return result;
                    }
                    continue;
                }

                // Start of a possible look-ahead match: save the position.
                lookaheadResult = fText.getIndex();
                if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && c != CI_DONE32) {
                    lookaheadResult--;
                }
                lookaheadStatus = stateTable[row + RBBIDataWrapper.LOOKAHEAD];
                lookaheadTagIdx = stateTable[row + RBBIDataWrapper.TAGIDX];
                continue;
            }

            if (stateTable[row + RBBIDataWrapper.ACCEPTING] != 0) {
                // An accepting state that is not part of a look-ahead cancels
                // any pending look-ahead match.
                lookaheadStatus = 0;
            }
        }

        // If the iterator failed to advance, force progress by one code point
        // so callers cannot loop forever on the same position.
        if (result == initialPosition) {
            fText.setIndex(initialPosition);
            CINext32(fText);
            result = fText.getIndex();
        }

        // Leave the iterator at the boundary.
        fText.setIndex(result);
        if (fTrace) {
            System.out.println("result = " + result);
        }
        return result;
    }


    /**
     * Runs the given state table backward from the current position until it
     * reaches the stop state or the start of the text, and moves the iterator
     * to the boundary that was found.
     *
     * <p>The cached rule status is marked as not valid, because status
     * indexes are not collected while moving backward.</p>
     *
     * @param stateTable the state table to run; may be {@code null}
     * @return the position of the boundary, or 0 if there is no text or no
     *         state table
     */
    private int handlePrevious(short[] stateTable) {
        int state;
        int category = 0;
        int mode;
        int row;
        int c;
        int lookaheadStatus = 0;
        int result = 0;
        int initialPosition = 0;
        int lookaheadResult = 0;

        if (fText == null || stateTable == null) {
            return 0;
        }

        // Read the table flags only after the null guard above; reading them
        // earlier would make the guard unreachable for a null table.
        boolean lookAheadHardBreak =
            (stateTable[RBBIDataWrapper.FLAGS+1] & RBBIDataWrapper.RBBI_LOOKAHEAD_HARD_BREAK) != 0;

        fLastStatusIndexValid = false;
        fLastRuleStatusIndex = 0;

        initialPosition = fText.getIndex();
        result = initialPosition;
        c = CIPrevious32(fText);

        state = START_STATE;
        row = fRData.getRowIndex(state);
        category = 3;
        mode = RBBI_RUN;
        if ((stateTable[RBBIDataWrapper.FLAGS+1] & RBBIDataWrapper.RBBI_BOF_REQUIRED) != 0) {
            // The table wants a start-of-input category (2) fed before any
            // real character.
            category = 2;
            mode = RBBI_START;
        }

        if (fTrace) {
            System.out.println("Handle Prev pos char state category ");
        }

        mainLoop: for (;;) {
            // "break innerBlock" skips the rest of the per-character work but
            // still runs the stop-state check and fetches the next character.
            innerBlock: {
                if (c == CI_DONE32) {
                    // Reached the start of the text.
                    if (mode == RBBI_END || fRData.fHeader.fVersion == 1) {
                        // Either the end-of-input category has already been
                        // fed, or the rule data is version 1, for which it is
                        // never fed.
                        if (lookaheadResult < result) {
                            // A pending look-ahead match lies beyond the best
                            // ordinary match; use it.
                            result = lookaheadResult;
                            lookaheadStatus = 0;
                        } else if (result == initialPosition) {
                            // Nothing matched: force progress by one code point.
                            fText.setIndex(initialPosition);
                            CIPrevious32(fText);
                        }
                        break mainLoop;
                    }
                    // Feed the end-of-input category (1) exactly once.
                    mode = RBBI_END;
                    category = 1;
                }

                if (mode == RBBI_RUN) {
                    category = (short) fRData.fTrie.getCodePointValue(c);

                    if ((category & 0x4000) != 0) {
                        // Bit 0x4000 flags a dictionary character; count it
                        // and strip the flag to get the table column.
                        fDictionaryCharCount++;
                        category &= ~0x4000;
                    }
                }

                if (fTrace) {
                    System.out.print(" " + fText.getIndex() + " ");
                    if (0x20 <= c && c < 0x7f) {
                        System.out.print(" " + c + " ");
                    } else {
                        System.out.print(" " + Integer.toHexString(c) + " ");
                    }
                    System.out.println(" " + state + " " + category + " ");
                }

                // Transition to the next state.
                state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category];
                row = fRData.getRowIndex(state);

                if (stateTable[row + RBBIDataWrapper.ACCEPTING] == -1) {
                    // Ordinary accepting state: remember the match position.
                    result = fText.getIndex();
                }

                if (stateTable[row + RBBIDataWrapper.LOOKAHEAD] != 0) {
                    if (lookaheadStatus != 0
                            && stateTable[row + RBBIDataWrapper.ACCEPTING] == lookaheadStatus) {
                        // End of a look-ahead match: the boundary is the
                        // position saved when the look-ahead began.
                        result = lookaheadResult;
                        lookaheadStatus = 0;

                        if (lookAheadHardBreak) {
                            break mainLoop;
                        }
                        break innerBlock;
                    }
                    // Start of a possible look-ahead match: save the position.
                    lookaheadResult = fText.getIndex();
                    lookaheadStatus = stateTable[row + RBBIDataWrapper.LOOKAHEAD];
                    break innerBlock;
                }

                if (stateTable[row + RBBIDataWrapper.ACCEPTING] != 0) {
                    // An accepting state outside a look-ahead cancels a
                    // pending look-ahead, unless the table uses hard breaks.
                    if (!lookAheadHardBreak) {
                        lookaheadStatus = 0;
                    }
                }
            }

            if (state == STOP_STATE) {
                break mainLoop;
            }

            if (mode == RBBI_RUN) {
                c = CIPrevious32(fText);
            } else {
                if (mode == RBBI_START) {
                    mode = RBBI_RUN;
                }
            }
        }

        // If the iterator failed to move, force progress by one code point so
        // callers cannot loop forever on the same position.
        if (result == initialPosition) {
            fText.setIndex(initialPosition);
            CIPrevious32(fText);
            result = fText.getIndex();
        }

        // Leave the iterator at the boundary.
        fText.setIndex(result);
        if (fTrace) {
            System.out.println("Result = " + result);
        }

        return result;
    }


    /**
     * Tells whether a code point is flagged as a dictionary character, that
     * is, whether its category value in the rule data's trie has bit 0x4000
     * set.
     *
     * @param c the code point to test
     * @return {@code true} if the dictionary bit is set for {@code c}
     */
    boolean isDictionaryChar(int c) {
        short category = (short) fRData.fTrie.getCodePointValue(c);
        return (category & 0x4000) != 0;
    }

}