package com.ibm.icu.text;

import java.text.CharacterIterator ;
import java.text.StringCharacterIterator ;
import java.util.Locale ;

import com.ibm.icu.impl.CharacterIteratorWrapper;
import com.ibm.icu.impl.NormalizerImpl;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.util.ULocale;

/**
 * Collation-based search of a pattern inside a target text, built on top of
 * {@link SearchIterator}. The pattern is converted to collation elements
 * (CEs) and compared against the CEs of the target text.
 */
public final class StringSearch extends SearchIterator
{

    /**
     * Creates a search for {@code pattern} in {@code target} using the given
     * collator and break iterator.
     * Records the text bounds, creates one collation element iterator over
     * the target and a second utility iterator (initially over ""), derives
     * the CE mask from the collator strength, starts in non-canonical mode
     * with no current match, and builds the pattern tables via initialize().
     * @param pattern text to look for
     * @param target text to search in
     * @param collator collator that defines the comparison rules
     * @param breakiter break iterator handed to the superclass; a null value
     *        disables the boundary test in isBreakUnit()
     */
    public StringSearch(String pattern, CharacterIterator target,
                        RuleBasedCollator collator, BreakIterator breakiter)
    {
        super(target, breakiter);
        m_textBeginOffset_ = targetText.getBeginIndex();
        m_textLimitOffset_ = targetText.getEndIndex();
        m_collator_ = collator;
        m_colEIter_ = m_collator_.getCollationElementIterator(target);
        m_utilColEIter_ = collator.getCollationElementIterator("");
        m_ceMask_ = getMask(m_collator_.getStrength());
        m_isCanonicalMatch_ = false;
        m_pattern_ = new Pattern(pattern);
        m_matchedIndex_ = DONE;

        initialize();
    }

    /**
     * Same as the four-argument constructor, using
     * BreakIterator.getCharacterInstance() as the break iterator.
     * @param pattern text to look for
     * @param target text to search in
     * @param collator collator that defines the comparison rules
     */
    public StringSearch(String pattern, CharacterIterator target,
                        RuleBasedCollator collator)
    {
        this(pattern, target, collator, BreakIterator.getCharacterInstance());
    }

    /**
     * Delegates to the ULocale constructor after converting {@code locale}
     * with ULocale.forLocale().
     * @param pattern text to look for
     * @param target text to search in
     * @param locale locale that selects the collator and break iterator
     */
    public StringSearch(String pattern, CharacterIterator target, Locale locale)
    {
        this(pattern, target, ULocale.forLocale(locale));
    }

    /**
     * Uses the collator and the character break iterator of {@code locale}.
     * The collator returned by Collator.getInstance(locale) is cast to
     * RuleBasedCollator, so any other collator type fails with a
     * ClassCastException.
     * @param pattern text to look for
     * @param target text to search in
     * @param locale locale that selects the collator and break iterator
     */
    public StringSearch(String pattern, CharacterIterator target, ULocale locale)
    {
        this(pattern, target, (RuleBasedCollator)Collator.getInstance(locale),
             BreakIterator.getCharacterInstance(locale));
    }

    /**
     * Searches a plain String target (wrapped in a StringCharacterIterator)
     * with the default collator and default character break iterator.
     * @param pattern text to look for
     * @param target text to search in
     */
    public StringSearch(String pattern, String target)
    {
        this(pattern, new StringCharacterIterator(target),
             (RuleBasedCollator)Collator.getInstance(),
             BreakIterator.getCharacterInstance());
    }

    /**
     * @return the collator currently used for matching
     */
    public RuleBasedCollator getCollator()
    {
        return m_collator_;
    }

    /**
     * @return the pattern string currently being searched for
     */
    public String getPattern()
    {
        return m_pattern_.targetText;
    }

    /**
     * Reports the current offset of the target text's collation element
     * iterator, or DONE when that offset lies outside the text bounds.
     * @return current index, or DONE
     */
    public
int getIndex() 321 { 322 int result = m_colEIter_.getOffset(); 323 if (isOutOfBounds(m_textBeginOffset_, m_textLimitOffset_, result)) { 324 return DONE; 325 } 326 return result; 327 } 328 329 337 public boolean isCanonical() 338 { 339 return m_isCanonicalMatch_; 340 } 341 342 344 357 public void setCollator(RuleBasedCollator collator) 358 { 359 if (collator == null) { 360 throw new IllegalArgumentException ("Collator can not be null"); 361 } 362 m_collator_ = collator; 363 m_ceMask_ = getMask(m_collator_.getStrength()); 364 initialize(); 366 m_colEIter_.setCollator(m_collator_); 367 m_utilColEIter_.setCollator(m_collator_); 368 } 369 370 384 public void setPattern(String pattern) 385 { 386 if (pattern == null || pattern.length() <= 0) { 387 throw new IllegalArgumentException ( 388 "Pattern to search for can not be null or of length 0"); 389 } 390 m_pattern_.targetText = pattern; 391 initialize(); 392 } 393 394 404 public void setTarget(CharacterIterator text) 405 { 406 super.setTarget(text); 407 m_textBeginOffset_ = targetText.getBeginIndex(); 408 m_textLimitOffset_ = targetText.getEndIndex(); 409 m_colEIter_.setText(targetText); 410 } 411 412 432 public void setIndex(int position) 433 { 434 super.setIndex(position); 435 m_matchedIndex_ = DONE; 436 m_colEIter_.setExactOffset(position); 437 } 438 439 448 public void setCanonical(boolean allowCanonical) 449 { 450 m_isCanonicalMatch_ = allowCanonical; 451 if (m_isCanonicalMatch_ == true) { 452 if (m_canonicalPrefixAccents_ == null) { 453 m_canonicalPrefixAccents_ = new StringBuffer (); 454 } 455 else { 456 m_canonicalPrefixAccents_.delete(0, 457 m_canonicalPrefixAccents_.length()); 458 } 459 if (m_canonicalSuffixAccents_ == null) { 460 m_canonicalSuffixAccents_ = new StringBuffer (); 461 } 462 else { 463 m_canonicalSuffixAccents_.delete(0, 464 m_canonicalSuffixAccents_.length()); 465 } 466 } 467 } 468 469 471 487 public void reset() 488 { 489 super.reset(); 493 m_isCanonicalMatch_ = false; 494 m_ceMask_ = 
getMask(m_collator_.getStrength()); // tail of reset(): refresh the CE mask
        initialize();
        m_colEIter_.setCollator(m_collator_);
        m_colEIter_.reset();
        m_utilColEIter_.setCollator(m_collator_);
    }

    /**
     * Finds the next match at or after {@code start} and leaves targetText
     * positioned on it (or on the text limit when nothing is found).
     * @param start offset to start searching from
     * @return start offset of the match, or DONE
     */
    protected int handleNext(int start)
    {
        if (m_pattern_.m_CELength_ == 0) {
            // The pattern has no non-ignorable CE: report an empty match at
            // each successive position instead of running the CE search.
            matchLength = 0;
            if (m_matchedIndex_ == DONE && start == m_textBeginOffset_) {
                m_matchedIndex_ = start;
                return m_matchedIndex_;
            }

            targetText.setIndex(start);
            char ch = targetText.current();
            char ch2 = targetText.next();
            if (ch2 == CharacterIterator.DONE) {
                m_matchedIndex_ = DONE;
            }
            else {
                m_matchedIndex_ = targetText.getIndex();
            }
            // step over the trail surrogate so a pair is never split
            if (UTF16.isLeadSurrogate(ch) && UTF16.isTrailSurrogate(ch2)) {
                targetText.next();
                m_matchedIndex_ = targetText.getIndex();
            }
        }
        else {
            if (matchLength <= 0) {
                // no previous match length recorded: seed m_matchedIndex_
                // for the search that follows
                if (start == m_textBeginOffset_) {
                    m_matchedIndex_ = DONE;
                }
                else {
                    m_matchedIndex_ = start - 1;
                }
            }

            // handleNextCanonical/handleNextExact are defined outside this
            // view; presumably they update m_matchedIndex_ and matchLength.
            if (m_isCanonicalMatch_) {
                handleNextCanonical(start);
            }
            else {
                handleNextExact(start);
            }
        }
        if (m_matchedIndex_ == DONE) {
            targetText.setIndex(m_textLimitOffset_);
        }
        else {
            targetText.setIndex(m_matchedIndex_);
        }
        return m_matchedIndex_;
    }

    /**
     * Finds the previous match before {@code start} and leaves targetText
     * positioned on it (or on the text begin when nothing is found).
     * @param start offset to start searching backwards from
     * @return start offset of the match, or DONE
     */
    protected int handlePrevious(int start)
    {
        if (m_pattern_.m_CELength_ == 0) {
            // Pattern without non-ignorable CEs: empty match one code point
            // back from start.
            matchLength = 0;
            targetText.setIndex(start);
            char ch = targetText.previous();
            if (ch == CharacterIterator.DONE) {
                m_matchedIndex_ = DONE;
            }
            else {
                m_matchedIndex_ = targetText.getIndex();
                if (UTF16.isTrailSurrogate(ch)) {
                    // move back onto the lead surrogate of the pair
                    if (UTF16.isLeadSurrogate(targetText.previous())) {
                        m_matchedIndex_ = targetText.getIndex();
                    }
                }
            }
        }
        else {
            if (matchLength == 0) {
                m_matchedIndex_ = DONE;
            }
            // handlePreviousCanonical/handlePreviousExact are defined
            // outside this view.
            if (m_isCanonicalMatch_) {
                handlePreviousCanonical(start);
            }
            else {
                handlePreviousExact(start);
            }
        }

        if (m_matchedIndex_ == DONE)
{ // tail of handlePrevious(): park targetText on the result
            targetText.setIndex(m_textBeginOffset_);
        }
        else {
            targetText.setIndex(m_matchedIndex_);
        }
        return m_matchedIndex_;
    }

    /**
     * Holder for the pattern string and the tables derived from it.
     */
    private static class Pattern
    {
        /** Pattern string. */
        protected String targetText;
        /** Non-ignorable collation elements of the pattern. */
        protected int m_CE_[];
        /** Number of valid entries in m_CE_. */
        protected int m_CELength_;
        /** Set by initializePattern() from the FCD value of the first pattern character. */
        protected boolean m_hasPrefixAccents_;
        /** Set by initializePattern() from the FCD value of the last pattern character. */
        protected boolean m_hasSuffixAccents_;
        /** Shift applied when no table entry is available. */
        protected int m_defaultShiftSize_;
        /** Forward shift table indexed by hash(ce). */
        protected char m_shift_[];
        /** Backward shift table indexed by hash(ce). */
        protected char m_backShift_[];

        /**
         * Stores the pattern string and allocates empty tables; the tables
         * are filled later by StringSearch.initialize().
         * @param pattern pattern string
         */
        protected Pattern(String pattern)
        {
            targetText = pattern;
            m_CE_ = new int[INITIAL_ARRAY_SIZE_];
            m_CELength_ = 0;
            m_hasPrefixAccents_ = false;
            m_hasSuffixAccents_ = false;
            m_defaultShiftSize_ = 1;
            m_shift_ = new char[MAX_TABLE_SIZE_];
            m_backShift_ = new char[MAX_TABLE_SIZE_];
        }
    };


    /** Begin index of the target text, cached from targetText.getBeginIndex(). */
    private int m_textBeginOffset_;
    /** End index of the target text, cached from targetText.getEndIndex(). */
    private int m_textLimitOffset_;
    /** Start offset of the current match, or DONE when there is none. */
    private int m_matchedIndex_;
    /** Pattern string together with its CE and shift tables. */
    private Pattern m_pattern_;
    /** Collator that defines the comparison. */
    private RuleBasedCollator m_collator_;
    /** Collation element iterator over the target text. */
    private CollationElementIterator m_colEIter_;
    /** Scratch collation element iterator, re-targeted with setText() as needed. */
    private CollationElementIterator m_utilColEIter_;
    /** Mask applied to every CE in getCE(); derived from the collator strength. */
    private int m_ceMask_;
    /** Accents tried in front of a canonical match; allocated by setCanonical(true). */
    private StringBuffer m_canonicalPrefixAccents_;
    /** Accents tried behind a canonical match; allocated by setCanonical(true). */
    private StringBuffer m_canonicalSuffixAccents_;
    /** True when canonical matching is enabled. */
    private boolean m_isCanonicalMatch_;
    /** Size of the shift tables and modulus used by hash(). */
    private static final int MAX_TABLE_SIZE_ = 257;
    /** Initial length, and growth increment, of int scratch arrays. */
    private static final int INITIAL_ARRAY_SIZE_ = 256;
    /** Shift that extracts the high byte of an FCD value. */
    private static final int SECOND_LAST_BYTE_SHIFT_ = 8;
    /** Mask that extracts the low byte of an FCD value. */
    private static final int LAST_BYTE_MASK_ = 0xff;
    /** Two-slot scratch buffer: [0] = start offset, [1] = end offset. */
    private int m_utilBuffer_[] = new int[2];


    /**
     * Maps a collation element to a shift-table slot using its primary order.
     * @param ce collation element
     * @return table index in [0, MAX_TABLE_SIZE_)
     */
    private static final int hash(int ce)
    {
        return CollationElementIterator.primaryOrder(ce) % MAX_TABLE_SIZE_;
    }

    /**
     * Returns the FCD value of the code point at {@code offset}, combining a
     * surrogate pair when one is present. Leaves {@code str} positioned at
     * offset, or on the following char when a lead surrogate was examined.
     * @param str text to read from
     * @param offset index of the character
     * @return FCD value; 0 for a lead surrogate without a trail surrogate
     */
    private static final char getFCD(CharacterIterator str, int offset)
    {
        str.setIndex(offset);
        char ch = str.current();
        char result = NormalizerImpl.getFCD16(ch);

        if ((result !=
0) && (str.getEndIndex() != offset + 1) && // tail of getFCD(CharacterIterator, int)
            UTF16.isLeadSurrogate(ch)) {
            ch = str.next();
            if (UTF16.isTrailSurrogate(ch)) {
                result = NormalizerImpl.getFCD16FromSurrogatePair(result, ch);
            } else {
                result = 0;
            }
        }
        return result;
    }

    /**
     * String flavour of getFCD: FCD value of the code point at
     * {@code offset}, combining a surrogate pair when one is present.
     * @param str text to read from
     * @param offset index of the character
     * @return FCD value; 0 for a lead surrogate without a trail surrogate
     */
    private static final char getFCD(String str, int offset)
    {
        char ch = str.charAt(offset);
        char result = NormalizerImpl.getFCD16(ch);

        if ((result != 0) && (str.length() != offset + 1) &&
            UTF16.isLeadSurrogate(ch)) {
            ch = str.charAt(offset + 1);
            if (UTF16.isTrailSurrogate(ch)) {
                result = NormalizerImpl.getFCD16FromSurrogatePair(result, ch);
            } else {
                result = 0;
            }
        }
        return result;
    }

    /**
     * Reduces a raw collation element to the form used for comparison:
     * masks it with the strength mask and, under shifted alternate
     * handling, maps elements below the variable top to their primary order
     * (QUATERNARY strength) or to IGNORABLE (any other strength).
     * @param ce raw collation element
     * @return comparable collation element
     */
    private final int getCE(int ce)
    {
        ce &= m_ceMask_;

        if (m_collator_.isAlternateHandlingShifted()) {
            // NOTE(review): signed int comparison against the shifted
            // variable top - confirm this is intended for CEs with the top
            // bit set.
            if ((m_collator_.m_variableTopValue_ << 16) > ce) {
                if (m_collator_.getStrength() == Collator.QUATERNARY) {
                    ce = CollationElementIterator.primaryOrder(ce);
                }
                else {
                    ce = CollationElementIterator.IGNORABLE;
                }
            }
        }

        return ce;
    }

    /**
     * Stores {@code value} at {@code offset}, growing the array when the
     * offset lies beyond its current length.
     * @param offset index to write to
     * @param value value to store
     * @param array destination array
     * @return the array written to; a new, larger copy when growth was needed
     */
    private static final int[] append(int offset, int value, int array[])
    {
        if (offset >= array.length) {
            int temp[] = new int[offset + INITIAL_ARRAY_SIZE_];
            System.arraycopy(array, 0, temp, 0, array.length);
            array = temp;
        }
        array[offset] = value;
        return array;
    }

    /**
     * Fills the pattern CE table with the non-ignorable collation elements
     * of the pattern, read through the utility iterator, terminates it with
     * a 0 entry and records the element count.
     * @return sum over all pattern CEs of (max expansion - 1)
     */
    private final int initializePatternCETable()
    {
        m_utilColEIter_.setText(m_pattern_.targetText);

        int offset = 0;
        int result = 0;
        int ce = m_utilColEIter_.next();

        while (ce != CollationElementIterator.NULLORDER) {
            int newce = getCE(ce);
            if (newce != CollationElementIterator.IGNORABLE) {
                m_pattern_.m_CE_ = append(offset, newce, m_pattern_.m_CE_);
                offset ++;
            }
            // expansion is looked up with the raw CE, ignorable or not
            result += m_utilColEIter_.getMaxExpansion(ce) - 1;
            ce = m_utilColEIter_.next();
        }

        m_pattern_.m_CE_ = append(offset, 0,
m_pattern_.m_CE_); // tail of initializePatternCETable(): 0 terminator
        m_pattern_.m_CELength_ = offset;

        return result;
    }

    /**
     * Derives the prefix/suffix accent flags from the FCD values of the
     * first and last pattern characters (high byte of the first, low byte of
     * the last) and then builds the pattern CE table.
     * @return value of initializePatternCETable()
     */
    private final int initializePattern()
    {
        m_pattern_.m_hasPrefixAccents_ = (getFCD(m_pattern_.targetText, 0)
                                             >> SECOND_LAST_BYTE_SHIFT_) != 0;
        m_pattern_.m_hasSuffixAccents_ = (getFCD(m_pattern_.targetText,
                                        m_pattern_.targetText.length()
                                                                    - 1)
                                            & LAST_BYTE_MASK_) != 0;
        return initializePatternCETable();
    }

    /**
     * Builds the forward and backward shift tables for the pattern CEs.
     * Every slot starts at the default shift; slots hit by pattern CEs get a
     * smaller shift that is never below 1.
     * @param shift forward shift table to fill
     * @param backshift backward shift table to fill
     * @param cetable pattern collation elements
     * @param cesize number of pattern collation elements
     * @param expansionsize expansion allowance subtracted from backward shifts
     * @param defaultforward default forward shift
     * @param defaultbackward default backward shift
     */
    private final void setShiftTable(char shift[],
                                     char backshift[],
                                     int cetable[], int cesize,
                                     int expansionsize,
                                     char defaultforward,
                                     char defaultbackward)
    {
        for (int count = 0; count < MAX_TABLE_SIZE_; count ++) {
            shift[count] = defaultforward;
        }
        cesize --; // from here on: index of the last pattern CE
        for (int count = 0; count < cesize; count ++) {
            int temp = defaultforward - count - 1;
            shift[hash(cetable[count])] = temp > 1 ? ((char)temp) : 1;
        }
        shift[hash(cetable[cesize])] = 1;
        // slot of CE 0 always shifts by one
        shift[hash(0)] = 1;

        for (int count = 0; count < MAX_TABLE_SIZE_; count ++) {
            backshift[count] = defaultbackward;
        }
        for (int count = cesize; count > 0; count --) {
            backshift[hash(cetable[count])] = (char)(count > expansionsize ?
                                                count - expansionsize : 1);
        }
        backshift[hash(cetable[0])] = 1;
        backshift[hash(0)] = 1;
    }

    /**
     * Rebuilds everything derived from the pattern: accent flags, CE table,
     * default shift size and both shift tables. A pattern without any
     * non-ignorable CE gets a default shift of 0 and no tables.
     */
    private final void initialize()
    {
        int expandlength  = initializePattern();
        if (m_pattern_.m_CELength_ > 0) {
            char minlength = (char)(m_pattern_.m_CELength_ > expandlength
                                ?
m_pattern_.m_CELength_ - expandlength : 1); // tail of initialize(): shift is at least 1
            m_pattern_.m_defaultShiftSize_ = minlength;
            setShiftTable(m_pattern_.m_shift_, m_pattern_.m_backShift_,
                          m_pattern_.m_CE_, m_pattern_.m_CELength_,
                          expandlength, minlength, minlength);
        }
        else {
            m_pattern_.m_defaultShiftSize_ = 0;
        }
    }

    /**
     * Checks that [start, end) begins and ends on boundaries of the break
     * iterator and that the collation elements starting at {@code start}
     * really spell the pattern without spilling past {@code end}.
     * Always true when no break iterator is set.
     * @param start start offset of the candidate match
     * @param end end offset of the candidate match
     * @return true when the candidate is acceptable
     */
    private final boolean isBreakUnit(int start, int end)
    {
        if (breakIterator != null) {
            int startindex = breakIterator.first();
            int endindex   = breakIterator.last();

            // out-of-range offsets can never be break units
            if (start < startindex || start > endindex || end < startindex
                || end > endindex) {
                return false;
            }
            // following(x - 1) == x holds exactly when x is a boundary
            boolean result = (start == startindex
                              || breakIterator.following(start - 1) == start)
                             && (end == endindex
                                 || breakIterator.following(end - 1) == end);
            if (result) {
                // re-read the CEs from start and compare with the pattern
                m_utilColEIter_.setText(
                    new CharacterIteratorWrapper(targetText), start);
                for (int count = 0; count < m_pattern_.m_CELength_;
                     count ++) {
                    int ce = getCE(m_utilColEIter_.next());
                    if (ce == CollationElementIterator.IGNORABLE) {
                        // ignorable CEs do not consume a pattern slot
                        count --;
                        continue;
                    }
                    if (ce != m_pattern_.m_CE_[count]) {
                        return false;
                    }
                }
                int nextce = m_utilColEIter_.next();
                while (m_utilColEIter_.getOffset() == end
                       && getCE(nextce) == CollationElementIterator.IGNORABLE) {
                    nextce = m_utilColEIter_.next();
                }
                // a further non-ignorable CE still reported at offset end
                // means the text at the match end produces more CEs than the
                // pattern covers
                if (nextce != CollationElementIterator.NULLORDER
                    && m_utilColEIter_.getOffset() == end) {
                    return false;
                }
            }
            return result;
        }
        return true;
    }

    /**
     * Starting at {@code textoffset}, returns the first offset whose FCD
     * high byte is 0, or the end index when none is found.
     * @param text text to scan
     * @param textoffset offset to start from
     * @return offset found, or textoffset unchanged when it is already at or
     *         past the end index
     */
    private final int getNextBaseOffset(CharacterIterator text,
                                        int textoffset)
    {
        if (textoffset < text.getEndIndex()) {
            // getFCD() repositions text, so text.getIndex() advances with
            // textoffset
            while (text.getIndex() < text.getEndIndex()) {
                int result = textoffset;
                if ((getFCD(text, textoffset ++)
                            >> SECOND_LAST_BYTE_SHIFT_) == 0) {
                     return result;
                }
            }
            return text.getEndIndex();
        }
        return textoffset;
    }
1115 1116 1124 private final int getNextBaseOffset(int textoffset) 1125 { 1126 if (m_pattern_.m_hasSuffixAccents_ 1127 && textoffset < m_textLimitOffset_) { 1128 targetText.setIndex(textoffset); 1129 targetText.previous(); 1130 if ((getFCD(targetText, targetText.getIndex()) & LAST_BYTE_MASK_) != 0) { 1131 return getNextBaseOffset(targetText, textoffset); 1132 } 1133 } 1134 return textoffset; 1135 } 1136 1137 1148 private int shiftForward(int textoffset, int ce, int patternceindex) 1149 1150 { 1151 if (ce != CollationElementIterator.NULLORDER) { 1152 int shift = m_pattern_.m_shift_[hash(ce)]; 1153 int adjust = m_pattern_.m_CELength_ - patternceindex; 1156 if (adjust > 1 && shift >= adjust) { 1157 shift -= adjust - 1; 1158 } 1159 textoffset += shift; 1160 } 1161 else { 1162 textoffset += m_pattern_.m_defaultShiftSize_; 1163 } 1164 1165 textoffset = getNextBaseOffset(textoffset); 1166 return textoffset; 1173 } 1174 1175 1183 private final int getNextSafeOffset(int textoffset, int end) 1184 { 1185 int result = textoffset; targetText.setIndex(result); 1187 while (result != end && 1188 m_collator_.isUnsafe(targetText.current())) { 1189 result ++; 1190 targetText.setIndex(result); 1191 } 1192 return result; 1193 } 1194 1195 1218 private final boolean checkExtraMatchAccents(int start, int end) 1219 { 1220 boolean result = false; 1221 if (m_pattern_.m_hasPrefixAccents_) { 1222 targetText.setIndex(start); 1223 1224 if (UTF16.isLeadSurrogate(targetText.next())) { 1225 if (!UTF16.isTrailSurrogate(targetText.next())) { 1226 targetText.previous(); 1227 } 1228 } 1229 String str = getString(targetText, start, end); 1231 if (Normalizer.quickCheck(str, Normalizer.NFD,0) 1232 == Normalizer.NO) { 1233 int safeoffset = getNextSafeOffset(start, end); 1234 if (safeoffset != end) { 1235 safeoffset ++; 1236 } 1237 String decomp = Normalizer.decompose( 1238 str.substring(0, safeoffset - start), false); 1239 m_utilColEIter_.setText(decomp); 1240 int firstce = m_pattern_.m_CE_[0]; 1241 
boolean ignorable = true; // tail of checkExtraMatchAccents(): cleared by any non-ignorable CE before firstce
                int ce = CollationElementIterator.IGNORABLE;
                int offset = 0;
                // NOTE(review): the loop compares raw CEs (no getCE()) with
                // the masked firstce and has no NULLORDER exit - confirm it
                // always terminates.
                while (ce != firstce) {
                    offset = m_utilColEIter_.getOffset();
                    if (ce != firstce
                        && ce != CollationElementIterator.IGNORABLE) {
                        ignorable = false;
                    }
                    ce = m_utilColEIter_.next();
                }
                // step back to the character just before the one that
                // produced firstce
                m_utilColEIter_.setExactOffset(offset);
                m_utilColEIter_.previous();
                offset = m_utilColEIter_.getOffset();
                result = !ignorable && (UCharacter.getCombiningClass(
                                            UTF16.charAt(decomp, offset)) != 0);
            }
        }

        return result;
    }

    /**
     * For patterns with prefix accents: decides whether the candidate match
     * [start, end) is preceded by, or begins with, accents that are not part
     * of the pattern.
     * @param start start offset of the candidate match
     * @param end end offset of the candidate match
     * @return true when such accents exist, so the candidate must be rejected
     */
    private final boolean hasAccentsBeforeMatch(int start, int end)
    {
        if (m_pattern_.m_hasPrefixAccents_) {
            boolean ignorable = true;
            int firstce = m_pattern_.m_CE_[0];
            m_colEIter_.setExactOffset(start);
            int ce  = getCE(m_colEIter_.next());
            // skip to the first pattern CE, noting whether anything
            // non-ignorable precedes it
            while (ce != firstce) {
                if (ce != CollationElementIterator.IGNORABLE) {
                    ignorable = false;
                }
                ce = getCE(m_colEIter_.next());
            }
            if (!ignorable && m_colEIter_.isInBuffer()) {
                return true;
            }

            // does the character at start have a non-zero FCD high byte?
            boolean accent = (getFCD(targetText, start) >> SECOND_LAST_BYTE_SHIFT_)
                                                        != 0;
            if (!accent) {
                return checkExtraMatchAccents(start, end);
            }
            if (!ignorable) {
                return true;
            }
            if (start > m_textBeginOffset_) {
                targetText.setIndex(start);
                targetText.previous();
                if ((getFCD(targetText, targetText.getIndex()) & LAST_BYTE_MASK_)
                                                                     != 0) {
                    // the character before start can combine with what
                    // follows: check the CE in front of the match
                    m_colEIter_.setExactOffset(start);
                    ce = m_colEIter_.previous();
                    if (ce != CollationElementIterator.NULLORDER
                        && ce != CollationElementIterator.IGNORABLE) {
                        return true;
                    }
                }
            }
        }

        return false;
    }

    /**
     * For patterns with suffix accents: decides whether the candidate match
     * [start, end) is followed by accents that are not part of the pattern.
     * @param start start offset of the candidate match
     * @param end end offset of the candidate match
     * @return true when such accents exist, so the candidate must be rejected
     */
    private final boolean hasAccentsAfterMatch(int start, int end)
    {
        if (m_pattern_.m_hasSuffixAccents_) {
            targetText.setIndex(end);
            // position targetText on the last code point of the match
            if (end > m_textBeginOffset_
                && UTF16.isTrailSurrogate(targetText.previous())) {
                if
(targetText.getIndex() > m_textBeginOffset_ && // tail of hasAccentsAfterMatch()
                    !UTF16.isLeadSurrogate(targetText.previous())) {
                    targetText.next();
                }
            }
            if ((getFCD(targetText, targetText.getIndex()) & LAST_BYTE_MASK_) != 0) {
                int firstce = m_pattern_.m_CE_[0];
                m_colEIter_.setExactOffset(start);
                // skip everything up to and including the first pattern CE
                while (getCE(m_colEIter_.next()) != firstce) {
                }
                // consume the remaining pattern CEs, not counting ignorables
                int count = 1;
                while (count < m_pattern_.m_CELength_) {
                    if (getCE(m_colEIter_.next())
                        == CollationElementIterator.IGNORABLE) {
                        count --;
                    }
                    count ++;
                }
                // first CE after the pattern
                int ce = getCE(m_colEIter_.next());
                if (ce != CollationElementIterator.NULLORDER
                            && ce != CollationElementIterator.IGNORABLE) {
                    if (m_colEIter_.getOffset() <= end) {
                        return true;
                    }
                    if ((getFCD(targetText, end) >> SECOND_LAST_BYTE_SHIFT_)
                        != 0) {
                        return true;
                    }
                }
            }
        }
        return false;
    }

    /**
     * @param textstart lowest valid offset
     * @param textlimit highest valid offset
     * @param offset offset to test
     * @return true when offset lies outside [textstart, textlimit]
     */
    private static final boolean isOutOfBounds(int textstart, int textlimit,
                                                int offset)
    {
        return offset < textstart || offset > textlimit;
    }

    /**
     * Extra check for IDENTICAL strength: the NFD forms of the matched text
     * and of the pattern must be equal strings. Always true for any other
     * strength.
     * @param start start offset of the candidate match
     * @param end end offset of the candidate match
     * @return true when the candidate passes
     */
    private final boolean checkIdentical(int start, int end)
    {
        if (m_collator_.getStrength() != Collator.IDENTICAL) {
            return true;
        }

        String textstr = getString(targetText, start, end - start);
        if (Normalizer.quickCheck(textstr, Normalizer.NFD,0)
                                                  == Normalizer.NO) {
            textstr = Normalizer.decompose(textstr, false);
        }
        String patternstr = m_pattern_.targetText;
        if (Normalizer.quickCheck(patternstr, Normalizer.NFD,0)
                                                  == Normalizer.NO) {
            patternstr = Normalizer.decompose(patternstr, false);
        }
        return textstr.equals(patternstr);
    }

    /**
     * Tests whether the candidate [start, limit) repeats the current match.
     * When overlapping is off, any intersection with the current match
     * counts; when it is on, only a candidate that covers the whole current
     * match counts.
     * @param start start offset of the candidate match
     * @param limit end offset (exclusive) of the candidate match
     * @return true when the candidate must be rejected as a repeat
     */
    private final boolean checkRepeatedMatch(int start, int limit)
    {
        if (m_matchedIndex_ == DONE) {
            return false;
        }
        int end = limit - 1; // last offset inside the candidate
        int lastmatchend = m_matchedIndex_ + matchLength - 1;
        if (!isOverlapping()) {
            return (start >= m_matchedIndex_ &&
start <= lastmatchend) // tail of checkRepeatedMatch(): start inside the last match
                   || (end >= m_matchedIndex_ && end <= lastmatchend)
                   || (start <= m_matchedIndex_ && end >= lastmatchend);

        }
        return start <= m_matchedIndex_ && end >= lastmatchend;
    }

    /**
     * Re-verifies a forward candidate [start, end) when the character at
     * end, or the one after start, is unsafe for the collator: the CEs are
     * re-read from start and compared with the pattern again.
     * The (possibly adjusted) start and end are returned in m_utilBuffer_[0]
     * and m_utilBuffer_[1].
     * @param start start offset of the candidate match
     * @param end end offset of the candidate match
     * @return true when the candidate still matches
     */
    private final boolean checkNextExactContractionMatch(int start, int end)
    {
        char endchar = 0;
        if (end < m_textLimitOffset_) {
            targetText.setIndex(end);
            endchar = targetText.current();
        }
        char poststartchar = 0;
        if (start + 1 < m_textLimitOffset_) {
            targetText.setIndex(start + 1);
            poststartchar = targetText.current();
        }
        if (m_collator_.isUnsafe(endchar)
            || m_collator_.isUnsafe(poststartchar)) {
            // remember how far into its CE buffer the iterator was, then
            // replay that many CEs from start
            int bufferedCEOffset = m_colEIter_.m_CEBufferOffset_;
            boolean hasBufferedCE = bufferedCEOffset > 0;
            m_colEIter_.setExactOffset(start);
            int temp = start;
            while (bufferedCEOffset > 0) {
                m_colEIter_.next();
                // the iterator moved on to another character: shift start
                if (m_colEIter_.getOffset() != temp) {
                    start = temp;
                    temp  = m_colEIter_.getOffset();
                }
                bufferedCEOffset --;
            }

            int count = 0;
            while (count < m_pattern_.m_CELength_) {
                int ce = getCE(m_colEIter_.next());
                if (ce == CollationElementIterator.IGNORABLE) {
                    continue;
                }
                if (hasBufferedCE && count == 0
                    && m_colEIter_.getOffset() != temp) {
                    start = temp;
                    temp = m_colEIter_.getOffset();
                }
                if (ce != m_pattern_.m_CE_[count]) {
                    // mismatch: report where the search should resume
                    end ++;
                    end = getNextBaseOffset(end);
                    m_utilBuffer_[0] = start;
                    m_utilBuffer_[1] = end;
                    return false;
                }
                count ++;
            }
        }
        m_utilBuffer_[0] = start;
        m_utilBuffer_[1] = end;
        return true;
    }



    /**
     * Runs all acceptance checks on a forward candidate that ends at
     * {@code textoffset} and starts at the current iterator offset. On
     * success the match is recorded in m_matchedIndex_ and matchLength; on
     * failure m_utilBuffer_[0] holds the offset to resume from.
     * @param textoffset end offset of the candidate match
     * @return true when the candidate is accepted
     */
    private final boolean checkNextExactMatch(int textoffset)
    {
        int start = m_colEIter_.getOffset();
        if (!checkNextExactContractionMatch(start, textoffset)) {
            // resume from the end offset computed by the contraction check
            m_utilBuffer_[0] = m_utilBuffer_[1];
            return false;
        }

        start = m_utilBuffer_[0];
        textoffset =
m_utilBuffer_[1]; // tail of checkNextExactMatch(): adjusted end offset
        if (!isBreakUnit(start, textoffset)
            || checkRepeatedMatch(start, textoffset)
            || hasAccentsBeforeMatch(start, textoffset)
            || !checkIdentical(start, textoffset)
            || hasAccentsAfterMatch(start, textoffset)) {
            textoffset ++;
            textoffset = getNextBaseOffset(textoffset);
            m_utilBuffer_[0] = textoffset;
            return false;
        }

        m_matchedIndex_  = start;
        matchLength = textoffset - start;
        return true;
    }

    /**
     * Walks backwards from {@code textoffset}, one code point at a time,
     * until a code point whose FCD high byte is 0 is reached.
     * @param text text to scan
     * @param textoffset offset to start from
     * @return the offset of that code point when its FCD low byte is
     *         non-zero, otherwise the offset just after it; the text begin
     *         offset when the scan reaches it; textoffset unchanged when it
     *         is not beyond the text begin
     */
    private final int getPreviousBaseOffset(CharacterIterator text,
                                            int textoffset)
    {
        if (textoffset > m_textBeginOffset_) {
            while (true) {
                int result = textoffset;
                text.setIndex(result);
                // step back one code point (two chars for a surrogate pair)
                if (UTF16.isTrailSurrogate(text.previous())) {
                    if (text.getIndex() != text.getBeginIndex() &&
                        !UTF16.isLeadSurrogate(text.previous())) {
                        text.next();
                    }
                }
                textoffset = text.getIndex();
                char fcd = getFCD(text, textoffset);
                if ((fcd >> SECOND_LAST_BYTE_SHIFT_) == 0) {
                    if ((fcd & LAST_BYTE_MASK_) != 0) {
                        return textoffset;
                    }
                    return result;
                }
                if (textoffset == m_textBeginOffset_) {
                    return m_textBeginOffset_;
                }
            }
        }
        return textoffset;
    }

    /**
     * Splits {@code accents} into runs of equal combining class and stores
     * the start index of every run, followed by the total length, in
     * {@code accentsindex}. A run starts wherever the combining class
     * differs from the previous one (the initial class is 0).
     * @param accents accent characters to analyse
     * @param accentsindex output array of run start indexes
     * @return number of runs recorded
     */
    private int getUnblockedAccentIndex(StringBuffer accents,
                                        int accentsindex[])
    {
        int index = 0;
        int length = accents.length();
        int cclass = 0;
        int result = 0;
        while (index < length) {
            int codepoint = UTF16.charAt(accents, index);
            int tempclass = UCharacter.getCombiningClass(codepoint);
            if (tempclass != cclass) {
                cclass = tempclass;
                accentsindex[result] = index;
                result ++;
            }
            if (UCharacter.isSupplementary(codepoint)) {
                index += 2;
            }
            else {
                index ++;
            }
        }
        accentsindex[result] = length;
        return result;
    }

    /**
     * Concatenates source1, the characters of source2 in [start2, end2) and
     * source3 into a new StringBuffer. Null or empty buffers are skipped.
     * Leaves source2 positioned at the index where copying stopped.
     * @param source1 leading text, may be null
     * @param source2 text to copy the middle part from
     * @param start2 first index to copy from source2
     * @param end2 index in source2 to stop before
     * @param source3 trailing text, may be null
     * @return newly allocated buffer with the merged text
     */
    private static final StringBuffer merge(StringBuffer source1,
                                            CharacterIterator source2,
int start2, int end2, // rest of the merge() parameter list
                                            StringBuffer source3)
    {
        StringBuffer result = new StringBuffer();
        if (source1 != null && source1.length() != 0) {
            // the flag selects between append(StringBuffer) and appending
            // the buffer's String form
            if(com.ibm.icu.impl.ICUDebug.isJDK14OrHigher){
                result.append(source1);
            }else{
                result.append(source1.toString());
            }
        }
        source2.setIndex(start2);
        while (source2.getIndex() < end2) {
            result.append(source2.current());
            source2.next();
        }
        if (source3 != null && source3.length() != 0) {
            if(com.ibm.icu.impl.ICUDebug.isJDK14OrHigher){
                result.append(source3);
            }else{
                result.append(source3.toString());
            }
        }
        return result;
    }

    /**
     * Reads collation elements forward from {@code coleiter} and compares
     * them, skipping ignorables, with the complete pattern CE table.
     * @param coleiter iterator positioned at the text to verify
     * @return true when every pattern CE was matched in order
     */
    private final boolean checkCollationMatch(CollationElementIterator coleiter)
    {
        int patternceindex = m_pattern_.m_CELength_; // pattern CEs still to match
        int offset = 0;
        while (patternceindex > 0) {
            int ce = getCE(coleiter.next());
            if (ce == CollationElementIterator.IGNORABLE) {
                continue;
            }
            if (ce != m_pattern_.m_CE_[offset]) {
                return false;
            }
            offset ++;
            patternceindex --;
        }
        return true;
    }

    /**
     * Canonical forward search, prefix side: tries subsets of the accent
     * runs found around {@code start} as the prefix of the match and checks
     * each rearranged candidate against the pattern.
     * @param start offset of the character carrying the accents
     * @param end end offset of the candidate match
     * @return start offset of the match, or DONE
     */
    private int doNextCanonicalPrefixMatch(int start, int end)
    {
        if ((getFCD(targetText, start) & LAST_BYTE_MASK_) == 0) {
            // FCD low byte of the character at start is 0: nothing to do
            return DONE;
        }

        // getFCD() left targetText on the last char it examined
        start = targetText.getIndex();
        int offset = getNextBaseOffset(targetText, start);
        start = getPreviousBaseOffset(start);

        StringBuffer accents = new StringBuffer();
        String accentstr = getString(targetText, start, offset - start);
        // normalise the accents to NFD before splitting them into runs
        if (Normalizer.quickCheck(accentstr, Normalizer.NFD,0)
                                                    == Normalizer.NO) {
            accentstr = Normalizer.decompose(accentstr, false);
        }
        accents.append(accentstr);

        int accentsindex[] = new int[INITIAL_ARRAY_SIZE_];
        int accentsize = getUnblockedAccentIndex(accents, accentsindex);
        // bit mask over the accent runs; every non-empty subset is tried,
        // counting down from "all runs"
        int count = (2 << (accentsize - 1)) - 1;
        while (count > 0) {
            m_canonicalPrefixAccents_.delete(0,
m_canonicalPrefixAccents_.length()); // tail of doNextCanonicalPrefixMatch(): empty the prefix buffer
            int k = 0;
            // characters in front of the first run are always kept
            for (; k < accentsindex[0]; k ++) {
                m_canonicalPrefixAccents_.append(accents.charAt(k));
            }
            // add every run whose bit is set in count
            for (int i = 0; i <= accentsize - 1; i ++) {
                int mask = 1 << (accentsize - i - 1);
                if ((count & mask) != 0) {
                    for (int j = accentsindex[i]; j < accentsindex[i + 1];
                                                                        j ++) {
                        m_canonicalPrefixAccents_.append(accents.charAt(j));
                    }
                }
            }
            StringBuffer match = merge(m_canonicalPrefixAccents_,
                                       targetText, offset, end,
                                       m_canonicalSuffixAccents_);

            // run the collator over the rearranged candidate
            m_utilColEIter_.setText(match.toString());
            if (checkCollationMatch(m_utilColEIter_)) {
                 return start;
            }
            count --;
        }
        return DONE;
    }

    /**
     * Walks backwards from {@code textoffset} while the preceding character
     * is unsafe for the collator.
     * @param start lowest offset to consider
     * @param textoffset offset to start from
     * @return safe offset at or before textoffset
     */
    private final int getPreviousSafeOffset(int start, int textoffset)
    {
        int result = textoffset;
        targetText.setIndex(textoffset);
        while (result >= start && m_collator_.isUnsafe(targetText.previous())) {
            result = targetText.getIndex();
        }
        if (result != start) {
            // targetText now sits on the first character that is not unsafe
            result = targetText.getIndex();
        }
        return result;
    }

    /**
     * Canonical forward search, suffix side: matches the pattern backwards
     * against the current suffix accents (preceded, when the first accent is
     * unsafe, by the text back to the previous safe offset) and continues
     * into the target text when the accents run out.
     * @param textoffset offset of the base character the accents belong to
     * @return start offset of the match, or DONE
     */
    private int doNextCanonicalSuffixMatch(int textoffset)
    {
        int safelength = 0;
        StringBuffer safetext;
        int safeoffset = m_textBeginOffset_;

        if (textoffset != m_textBeginOffset_
            && m_canonicalSuffixAccents_.length() > 0
            && m_collator_.isUnsafe(m_canonicalSuffixAccents_.charAt(0))) {
            safeoffset     = getPreviousSafeOffset(m_textBeginOffset_,
                                                    textoffset);
            safelength     = textoffset - safeoffset;
            safetext       = merge(null, targetText, safeoffset, textoffset,
                                   m_canonicalSuffixAccents_);
        }
        else {
            safetext = m_canonicalSuffixAccents_;
        }

        // start on the scratch iterator; falls back to the text iterator
        CollationElementIterator coleiter = m_utilColEIter_;
        coleiter.setText(safetext.toString());

        int ceindex = m_pattern_.m_CELength_ - 1;
        boolean isSafe = true; // true while reading from safetext
        while (ceindex >= 0) {
            int textce = coleiter.previous();
            if (textce ==
CollationElementIterator.NULLORDER) { // tail of doNextCanonicalSuffixMatch(): current source exhausted
                if (coleiter == m_colEIter_) {
                    // already on the target text: nothing left to read
                    return DONE;
                }
                // switch from the scratch text to the real target text
                coleiter = m_colEIter_;
                if (safetext != m_canonicalSuffixAccents_) {
                    safetext.delete(0, safetext.length());
                }
                coleiter.setExactOffset(safeoffset);
                isSafe = false;
                continue;
            }
            textce = getCE(textce);
            if (textce != CollationElementIterator.IGNORABLE
                && textce != m_pattern_.m_CE_[ceindex]) {
                // mismatch: decide whether a prefix rearrangement can help
                int failedoffset = coleiter.getOffset();
                if (isSafe && failedoffset >= safelength) {
                    // failure inside the accent part of safetext
                    return DONE;
                }
                else {
                    if (isSafe) {
                        // convert the safetext offset to a target offset
                        failedoffset += safeoffset;
                    }

                    int result = doNextCanonicalPrefixMatch(failedoffset,
                                                            textoffset);
                    if (result != DONE) {
                        m_colEIter_.setExactOffset(result);
                    }
                    return result;
                }
            }
            if (textce == m_pattern_.m_CE_[ceindex]) {
                ceindex --;
            }
        }
        if (isSafe) {
            // whole pattern matched inside safetext: map the offset back to
            // the target text and carry over the CE buffer position
            int result = coleiter.getOffset();
            int leftoverces = coleiter.m_CEBufferOffset_;
            if (result >= safelength) {
                result = textoffset;
            }
            else {
                result += safeoffset;
            }
            m_colEIter_.setExactOffset(result);
            m_colEIter_.m_CEBufferOffset_ = leftoverces;
            return result;
        }

        return coleiter.getOffset();
    }

    /**
     * Canonical forward search: attempts to turn a failed exact comparison
     * ending at {@code textoffset} into a match by rearranging the accents
     * around the candidate.
     * @param textoffset end offset of the candidate match
     * @return true when a canonical match was found
     */
    private boolean doNextCanonicalMatch(int textoffset)
    {
        int offset = m_colEIter_.getOffset();
        targetText.setIndex(textoffset);
        // position targetText on the code point before textoffset
        if (UTF16.isTrailSurrogate(targetText.previous())
            && targetText.getIndex() > m_textBeginOffset_) {
            if (!UTF16.isLeadSurrogate(targetText.previous())) {
                targetText.next();
            }
        }
        if ((getFCD(targetText, targetText.getIndex()) & LAST_BYTE_MASK_) == 0) {
            // FCD low byte of the last character is 0: only the prefix side
            // can be rearranged
            if (m_pattern_.m_hasPrefixAccents_) {
                offset = doNextCanonicalPrefixMatch(offset, textoffset);
                if (offset != DONE) {
                    m_colEIter_.setExactOffset(offset);
                    return true;
                }
            }
            return false;
        }

        if
(!m_pattern_.m_hasSuffixAccents_) { // tail of doNextCanonicalMatch()
            return false;
        }

        StringBuffer accents = new StringBuffer();
        int baseoffset = getPreviousBaseOffset(targetText, textoffset);
        String accentstr = getString(targetText, baseoffset,
                               textoffset - baseoffset);
        // normalise the accents to NFD before splitting them into runs
        if (Normalizer.quickCheck(accentstr, Normalizer.NFD,0)
                                                    == Normalizer.NO) {
            accentstr = Normalizer.decompose(accentstr, false);
        }
        accents.append(accentstr);

        int accentsindex[] = new int[INITIAL_ARRAY_SIZE_];
        int size = getUnblockedAccentIndex(accents, accentsindex);

        // bit mask over the accent runs; every non-empty subset is tried
        int  count = (2 << (size - 1)) - 1;
        while (count > 0) {
            m_canonicalSuffixAccents_.delete(0,
                                        m_canonicalSuffixAccents_.length());
            // characters in front of the first run are always kept
            for (int k = 0; k < accentsindex[0]; k ++) {
                m_canonicalSuffixAccents_.append(accents.charAt(k));
            }
            // add every run whose bit is set in count
            for (int i = 0; i <= size - 1; i ++) {
                int mask = 1 << (size - i - 1);
                if ((count & mask) != 0) {
                    for (int j = accentsindex[i]; j < accentsindex[i + 1];
                        j ++) {
                        m_canonicalSuffixAccents_.append(accents.charAt(j));
                    }
                }
            }
            offset = doNextCanonicalSuffixMatch(baseoffset);
            if (offset != DONE) {
                return true;
            }
            count --;
        }
        return false;
    }

    /**
     * Pulls {@code textoffset} back to the previous base offset of the
     * target text, but only when the pattern has prefix accents and the
     * character at textoffset has a non-zero FCD high byte.
     * @param textoffset offset in the target text
     * @return adjusted offset, or textoffset unchanged
     */
    private final int getPreviousBaseOffset(int textoffset)
    {
        if (m_pattern_.m_hasPrefixAccents_ && textoffset > m_textBeginOffset_) {
            int offset = textoffset;
            if ((getFCD(targetText, offset) >> SECOND_LAST_BYTE_SHIFT_) != 0) {
                return getPreviousBaseOffset(targetText, textoffset);
            }
        }
        return textoffset;
    }

    /**
     * Canonical counterpart of checkNextExactContractionMatch(): re-verifies
     * a forward candidate [start, end) when a boundary character is unsafe
     * for the collator. The (possibly adjusted) start and end are returned
     * in m_utilBuffer_[0] and m_utilBuffer_[1].
     * @param start start offset of the candidate match
     * @param end end offset of the candidate match
     * @return true when the candidate still matches
     */
    private boolean checkNextCanonicalContractionMatch(int start, int end)
    {
        char schar = 0;
        char echar = 0;
        if (end < m_textLimitOffset_) {
            targetText.setIndex(end);
            echar = targetText.current();
        }
        // NOTE(review): the exact-match variant guards this read with
        // "start + 1 < m_textLimitOffset_"; here start == limit - 1 reads
        // at the limit itself - confirm the difference is intended.
        if (start < m_textLimitOffset_) {
            targetText.setIndex(start + 1);
            schar = targetText.current();
        }
        if
(m_collator_.isUnsafe(echar) || m_collator_.isUnsafe(schar)) { // tail of checkNextCanonicalContractionMatch()
            // remember how far into its CE buffer the iterator was, then
            // replay that many CEs from start
            int expansion = m_colEIter_.m_CEBufferOffset_;
            boolean hasExpansion = expansion > 0;
            m_colEIter_.setExactOffset(start);
            int temp = start;
            while (expansion > 0) {
                m_colEIter_.next();
                // the iterator moved on to another character: shift start
                if (m_colEIter_.getOffset() != temp) {
                    start = temp;
                    temp  = m_colEIter_.getOffset();
                }
                expansion --;
            }

            int count = 0;
            while (count < m_pattern_.m_CELength_) {
                int ce = getCE(m_colEIter_.next());
                if (ce == CollationElementIterator.IGNORABLE) {
                    continue;
                }
                if (hasExpansion && count == 0
                    && m_colEIter_.getOffset() != temp) {
                    start = temp;
                    temp = m_colEIter_.getOffset();
                }

                if (count == 0 && ce != m_pattern_.m_CE_[0]) {
                    // the first CE differs: when the character at start has
                    // a non-zero FCD low byte, skip ahead (up to end) to
                    // the first pattern CE
                    int expected = m_pattern_.m_CE_[0];
                    if ((getFCD(targetText, start) & LAST_BYTE_MASK_) != 0) {
                        ce = getCE(m_colEIter_.next());
                        while (ce != expected
                               && ce != CollationElementIterator.NULLORDER
                               && m_colEIter_.getOffset() <= end) {
                            ce = getCE(m_colEIter_.next());
                        }
                    }
                }
                if (ce != m_pattern_.m_CE_[count]) {
                    // mismatch: report where the search should resume
                    end ++;
                    end = getNextBaseOffset(end);
                    m_utilBuffer_[0] = start;
                    m_utilBuffer_[1] = end;
                    return false;
                }
                count ++;
            }
        }
        m_utilBuffer_[0] = start;
        m_utilBuffer_[1] = end;
        return true;
    }

    /**
     * Runs the acceptance checks on a canonical forward candidate ending at
     * {@code textoffset}. On success the match is recorded in
     * m_matchedIndex_ and matchLength; on failure m_utilBuffer_[0] holds the
     * offset to resume from.
     * @param textoffset end offset of the candidate match
     * @return true when the candidate is accepted
     */
    private boolean checkNextCanonicalMatch(int textoffset)
    {
        // non-empty accent buffers mean a rearranged match was already
        // established by the canonical prefix/suffix search: accept it
        if ((m_pattern_.m_hasSuffixAccents_
                && m_canonicalSuffixAccents_.length() != 0) ||
            (m_pattern_.m_hasPrefixAccents_
                && m_canonicalPrefixAccents_.length() != 0)) {
            m_matchedIndex_ = getPreviousBaseOffset(m_colEIter_.getOffset());
            matchLength = textoffset - m_matchedIndex_;
            return true;
        }

        int start = m_colEIter_.getOffset();
        if (!checkNextCanonicalContractionMatch(start, textoffset)) {
            // resume from the end offset computed by the contraction check
            m_utilBuffer_[0] = m_utilBuffer_[1];
            return false;
        }
        start
= m_utilBuffer_[0]; // tail of checkNextCanonicalMatch(): adjusted start offset
        textoffset = m_utilBuffer_[1];
        start = getPreviousBaseOffset(start);
        if (checkRepeatedMatch(start, textoffset)
            || !isBreakUnit(start, textoffset)
            || !checkIdentical(start, textoffset)) {
            textoffset ++;
            textoffset = getNextBaseOffset(targetText, textoffset);
            m_utilBuffer_[0] = textoffset;
            return false;
        }

        m_matchedIndex_  = start;
        matchLength = textoffset - start;
        return true;
    }

    /**
     * Computes the next text offset to try after a backward mismatch.
     * With overlapping enabled the offset moves back by one (or by the
     * default shift when it sits at the text limit); otherwise the backward
     * shift table is used.
     * @param textoffset current text offset
     * @param ce collation element that failed to match, or NULLORDER
     * @param patternceindex index of the pattern CE being compared
     * @return new text offset, adjusted by getPreviousBaseOffset(int)
     */
    private int reverseShift(int textoffset, int ce, int patternceindex)
    {
        if (isOverlapping()) {
            if (textoffset != m_textLimitOffset_) {
                textoffset --;
            }
            else {
                textoffset -= m_pattern_.m_defaultShiftSize_;
            }
        }
        else {
            if (ce != CollationElementIterator.NULLORDER) {
                int shift = m_pattern_.m_backShift_[hash(ce)];

                int adjust = patternceindex;
                if (adjust > 1 && shift > adjust) {
                    shift -= adjust - 1;
                }
                textoffset -= shift;
            }
            else {
                textoffset -= m_pattern_.m_defaultShiftSize_;
            }
        }

        textoffset = getPreviousBaseOffset(textoffset);
        return textoffset;
    }

    /**
     * Backward counterpart of checkNextExactContractionMatch(): re-verifies
     * a candidate [start, end) by reading CEs backwards from end when a
     * boundary character is unsafe for the collator. The (possibly adjusted)
     * start and end are returned in m_utilBuffer_[0] and m_utilBuffer_[1].
     * @param start start offset of the candidate match
     * @param end end offset of the candidate match
     * @return true when the candidate still matches
     */
    private boolean checkPreviousExactContractionMatch(int start, int end)
    {
        char echar = 0;
        if (end < m_textLimitOffset_) {
            targetText.setIndex(end);
            echar = targetText.current();
        }
        char schar = 0;
        if (start + 1 < m_textLimitOffset_) {
            targetText.setIndex(start + 1);
            schar = targetText.current();
        }
        if (m_collator_.isUnsafe(echar) || m_collator_.isUnsafe(schar)) {
            // number of buffered CEs not yet consumed, replayed backwards
            int expansion = m_colEIter_.m_CEBufferSize_
                                            - m_colEIter_.m_CEBufferOffset_;
            boolean hasExpansion = expansion > 0;
            m_colEIter_.setExactOffset(end);
            int temp = end;
            while (expansion > 0) {
                m_colEIter_.previous();
                // the iterator moved on to another character: shift end
                if (m_colEIter_.getOffset() != temp) {
                    end = temp;
                    temp  = m_colEIter_.getOffset();
                }
                expansion --;
            }

int count = m_pattern_.m_CELength_; 2239 while (count > 0) { 2240 int ce = getCE(m_colEIter_.previous()); 2241 if (ce == CollationElementIterator.IGNORABLE) { 2244 continue; 2245 } 2246 if (hasExpansion && count == 0 2247 && m_colEIter_.getOffset() != temp) { 2248 end = temp; 2249 temp = m_colEIter_.getOffset(); 2250 } 2251 if (ce != m_pattern_.m_CE_[count - 1]) { 2252 start --; 2253 start = getPreviousBaseOffset(targetText, start); 2254 m_utilBuffer_[0] = start; 2255 m_utilBuffer_[1] = end; 2256 return false; 2257 } 2258 count --; 2259 } 2260 } 2261 m_utilBuffer_[0] = start; 2262 m_utilBuffer_[1] = end; 2263 return true; 2264 } 2265 2266 2282 private final boolean checkPreviousExactMatch(int textoffset) 2283 { 2284 int end = m_colEIter_.getOffset(); 2286 if (!checkPreviousExactContractionMatch(textoffset, end)) { 2287 return false; 2288 } 2289 textoffset = m_utilBuffer_[0]; 2290 end = m_utilBuffer_[1]; 2291 2292 if (checkRepeatedMatch(textoffset, end) 2295 || !isBreakUnit(textoffset, end) 2296 || hasAccentsBeforeMatch(textoffset, end) 2297 || !checkIdentical(textoffset, end) 2298 || hasAccentsAfterMatch(textoffset, end)) { 2299 textoffset --; 2300 textoffset = getPreviousBaseOffset(targetText, textoffset); 2301 m_utilBuffer_[0] = textoffset; 2302 return false; 2303 } 2304 m_matchedIndex_ = textoffset; 2305 matchLength = end - textoffset; 2306 return true; 2307 } 2308 2309 2326 private int doPreviousCanonicalSuffixMatch(int start, int end) 2327 { 2328 targetText.setIndex(end); 2329 if (UTF16.isTrailSurrogate(targetText.previous()) 2330 && targetText.getIndex() > m_textBeginOffset_) { 2331 if (!UTF16.isLeadSurrogate(targetText.previous())) { 2332 targetText.next(); 2333 } 2334 } 2335 if ((getFCD(targetText, targetText.getIndex()) & LAST_BYTE_MASK_) == 0) { 2336 return DONE; 2338 } 2339 end = getNextBaseOffset(targetText, end); 2340 2341 StringBuffer accents = new StringBuffer (); 2342 int offset = getPreviousBaseOffset(targetText, end); 2343 String accentstr = 
getString(targetText, offset, end - offset); 2345 if (Normalizer.quickCheck(accentstr, Normalizer.NFD,0) 2346 == Normalizer.NO) { 2347 accentstr = Normalizer.decompose(accentstr, false); 2348 } 2349 accents.append(accentstr); 2350 2351 int accentsindex[] = new int[INITIAL_ARRAY_SIZE_]; 2352 int accentsize = getUnblockedAccentIndex(accents, accentsindex); 2353 int count = (2 << (accentsize - 1)) - 1; 2354 while (count > 0) { 2355 m_canonicalSuffixAccents_.delete(0, 2356 m_canonicalSuffixAccents_.length()); 2357 for (int k = 0; k < accentsindex[0]; k ++) { 2359 m_canonicalSuffixAccents_.append(accents.charAt(k)); 2360 } 2361 for (int i = 0; i <= accentsize - 1; i ++) { 2364 int mask = 1 << (accentsize - i - 1); 2365 if ((count & mask) != 0) { 2366 for (int j = accentsindex[i]; j < accentsindex[i + 1]; 2367 j ++) { 2368 m_canonicalSuffixAccents_.append(accents.charAt(j)); 2369 } 2370 } 2371 } 2372 StringBuffer match = merge(m_canonicalPrefixAccents_, targetText, 2373 start, offset, 2374 m_canonicalSuffixAccents_); 2375 m_utilColEIter_.setText(match.toString()); 2378 if (checkCollationMatch(m_utilColEIter_)) { 2379 return end; 2380 } 2381 count --; 2382 } 2383 return DONE; 2384 } 2385 2386 2399 private int doPreviousCanonicalPrefixMatch(int textoffset) 2400 { 2401 StringBuffer safetext; 2403 int safeoffset = textoffset; 2404 2405 if (textoffset > m_textBeginOffset_ 2406 && m_collator_.isUnsafe(m_canonicalPrefixAccents_.charAt( 2407 m_canonicalPrefixAccents_.length() - 1))) { 2408 safeoffset = getNextSafeOffset(textoffset, m_textLimitOffset_); 2409 safetext = merge(m_canonicalPrefixAccents_, targetText, textoffset, 2411 safeoffset, null); 2412 } 2413 else { 2414 safetext = m_canonicalPrefixAccents_; 2415 } 2416 2417 CollationElementIterator coleiter = m_utilColEIter_; 2419 coleiter.setText(safetext.toString()); 2420 2422 int ceindex = 0; 2423 boolean isSafe = true; int prefixlength = m_canonicalPrefixAccents_.length(); 2425 2426 while (ceindex < m_pattern_.m_CELength_) 
{ 2427 int textce = coleiter.next(); 2428 if (textce == CollationElementIterator.NULLORDER) { 2429 if (coleiter == m_colEIter_) { 2431 return DONE; 2432 } 2433 if (safetext != m_canonicalPrefixAccents_) { 2434 safetext.delete(0, safetext.length()); 2435 } 2436 coleiter = m_colEIter_; 2437 coleiter.setExactOffset(safeoffset); 2438 isSafe = false; 2440 continue; 2441 } 2442 textce = getCE(textce); 2443 if (textce != CollationElementIterator.IGNORABLE 2444 && textce != m_pattern_.m_CE_[ceindex]) { 2445 int failedoffset = coleiter.getOffset(); 2447 if (isSafe && failedoffset <= prefixlength) { 2448 return DONE; 2450 } 2451 else { 2452 if (isSafe) { 2453 failedoffset = safeoffset - failedoffset; 2454 if (safetext != m_canonicalPrefixAccents_) { 2455 safetext.delete(0, safetext.length()); 2456 } 2457 } 2458 2459 int result = doPreviousCanonicalSuffixMatch(textoffset, 2461 failedoffset); 2462 if (result != DONE) { 2463 m_colEIter_.setExactOffset(result); 2465 } 2466 return result; 2467 } 2468 } 2469 if (textce == m_pattern_.m_CE_[ceindex]) { 2470 ceindex ++; 2471 } 2472 } 2473 if (isSafe) { 2475 int result = coleiter.getOffset(); 2476 int leftoverces = coleiter.m_CEBufferSize_ 2478 - coleiter.m_CEBufferOffset_; 2479 if (result <= prefixlength) { 2480 result = textoffset; 2481 } 2482 else { 2483 result = textoffset + (safeoffset - result); 2484 } 2485 m_colEIter_.setExactOffset(result); 2486 m_colEIter_.m_CEBufferOffset_ = m_colEIter_.m_CEBufferSize_ 2487 - leftoverces; 2488 return result; 2489 } 2490 2491 return coleiter.getOffset(); 2492 } 2493 2494 2512 private boolean doPreviousCanonicalMatch(int textoffset) 2513 { 2514 int offset = m_colEIter_.getOffset(); 2515 if ((getFCD(targetText, textoffset) >> SECOND_LAST_BYTE_SHIFT_) == 0) { 2516 if (m_pattern_.m_hasSuffixAccents_) { 2517 offset = doPreviousCanonicalSuffixMatch(textoffset, offset); 2518 if (offset != DONE) { 2519 m_colEIter_.setExactOffset(offset); 2520 return true; 2521 } 2522 } 2523 return false; 2524 } 2525 
2526 if (!m_pattern_.m_hasPrefixAccents_) { 2527 return false; 2528 } 2529 2530 StringBuffer accents = new StringBuffer (); 2531 int baseoffset = getNextBaseOffset(targetText, textoffset); 2533 String textstr = getString(targetText, textoffset, 2535 baseoffset - textoffset); 2536 if (Normalizer.quickCheck(textstr, Normalizer.NFD,0) 2537 == Normalizer.NO) { 2538 textstr = Normalizer.decompose(textstr, false); 2539 } 2540 accents.append(textstr); 2541 2543 int accentsindex[] = new int[INITIAL_ARRAY_SIZE_]; 2544 int size = getUnblockedAccentIndex(accents, accentsindex); 2545 2546 int count = (2 << (size - 1)) - 1; 2548 while (count > 0) { 2549 m_canonicalPrefixAccents_.delete(0, 2550 m_canonicalPrefixAccents_.length()); 2551 for (int k = 0; k < accentsindex[0]; k ++) { 2553 m_canonicalPrefixAccents_.append(accents.charAt(k)); 2554 } 2555 for (int i = 0; i <= size - 1; i ++) { 2558 int mask = 1 << (size - i - 1); 2559 if ((count & mask) != 0) { 2560 for (int j = accentsindex[i]; j < accentsindex[i + 1]; 2561 j ++) { 2562 m_canonicalPrefixAccents_.append(accents.charAt(j)); 2563 } 2564 } 2565 } 2566 offset = doPreviousCanonicalPrefixMatch(baseoffset); 2567 if (offset != DONE) { 2568 return true; } 2570 count --; 2571 } 2572 return false; 2573 } 2574 2575 2583 private boolean checkPreviousCanonicalContractionMatch(int start, int end) 2584 { 2585 int temp = end; 2586 char echar = 0; 2589 char schar = 0; 2590 if (end < m_textLimitOffset_) { 2591 targetText.setIndex(end); 2592 echar = targetText.current(); 2593 } 2594 if (start + 1 < m_textLimitOffset_) { 2595 targetText.setIndex(start + 1); 2596 schar = targetText.current(); 2597 } 2598 if (m_collator_.isUnsafe(echar) || m_collator_.isUnsafe(schar)) { 2599 int expansion = m_colEIter_.m_CEBufferSize_ 2600 - m_colEIter_.m_CEBufferOffset_; 2601 boolean hasExpansion = expansion > 0; 2602 m_colEIter_.setExactOffset(end); 2603 while (expansion > 0) { 2604 m_colEIter_.previous(); 2612 if (m_colEIter_.getOffset() != temp) { 2613 
end = temp; 2614 temp = m_colEIter_.getOffset(); 2615 } 2616 expansion --; 2617 } 2618 2619 int count = m_pattern_.m_CELength_; 2620 while (count > 0) { 2621 int ce = getCE(m_colEIter_.previous()); 2622 if (ce == CollationElementIterator.IGNORABLE) { 2625 continue; 2626 } 2627 if (hasExpansion && count == 0 2628 && m_colEIter_.getOffset() != temp) { 2629 end = temp; 2630 temp = m_colEIter_.getOffset(); 2631 } 2632 if (count == m_pattern_.m_CELength_ 2633 && ce != m_pattern_.m_CE_[m_pattern_.m_CELength_ - 1]) { 2634 int expected = m_pattern_.m_CE_[m_pattern_.m_CELength_ - 1]; 2637 targetText.setIndex(end); 2638 if (UTF16.isTrailSurrogate(targetText.previous())) { 2639 if (targetText.getIndex() > m_textBeginOffset_ && 2640 !UTF16.isLeadSurrogate(targetText.previous())) { 2641 targetText.next(); 2642 } 2643 } 2644 end = targetText.getIndex(); 2645 if ((getFCD(targetText, end) & LAST_BYTE_MASK_) != 0) { 2646 ce = getCE(m_colEIter_.previous()); 2647 while (ce != expected 2648 && ce != CollationElementIterator.NULLORDER 2649 && m_colEIter_.getOffset() <= start) { 2650 ce = getCE(m_colEIter_.previous()); 2651 } 2652 } 2653 } 2654 if (ce != m_pattern_.m_CE_[count - 1]) { 2655 start --; 2656 start = getPreviousBaseOffset(start); 2657 m_utilBuffer_[0] = start; 2658 m_utilBuffer_[1] = end; 2659 return false; 2660 } 2661 count --; 2662 } 2663 } 2664 m_utilBuffer_[0] = start; 2665 m_utilBuffer_[1] = end; 2666 return true; 2667 } 2668 2669 2685 private boolean checkPreviousCanonicalMatch(int textoffset) 2686 { 2687 if (m_pattern_.m_hasSuffixAccents_ 2690 && m_canonicalSuffixAccents_.length() != 0 2691 || m_pattern_.m_hasPrefixAccents_ 2692 && m_canonicalPrefixAccents_.length() != 0) { 2693 m_matchedIndex_ = textoffset; 2694 matchLength = getNextBaseOffset(m_colEIter_.getOffset()) 2695 - textoffset; 2696 return true; 2697 } 2698 2699 int end = m_colEIter_.getOffset(); 2700 if (!checkPreviousCanonicalContractionMatch(textoffset, end)) { 2701 return false; 2703 } 2704 textoffset = 
m_utilBuffer_[0]; 2705 end = m_utilBuffer_[1]; 2706 end = getNextBaseOffset(end); 2707 if (checkRepeatedMatch(textoffset, end) 2709 || !isBreakUnit(textoffset, end) 2710 || !checkIdentical(textoffset, end)) { 2711 textoffset --; 2712 textoffset = getPreviousBaseOffset(textoffset); 2713 m_utilBuffer_[0] = textoffset; 2714 return false; 2715 } 2716 2717 m_matchedIndex_ = textoffset; 2718 matchLength = end - textoffset; 2719 return true; 2720 } 2721 2722 2727 private void handleNextExact(int start) 2728 { 2729 int textoffset = shiftForward(start, 2730 CollationElementIterator.NULLORDER, 2731 m_pattern_.m_CELength_); 2732 int targetce = CollationElementIterator.IGNORABLE; 2733 while (textoffset <= m_textLimitOffset_) { 2734 m_colEIter_.setExactOffset(textoffset); 2735 int patternceindex = m_pattern_.m_CELength_ - 1; 2736 boolean found = false; 2737 int lastce = CollationElementIterator.NULLORDER; 2738 2739 while (true) { 2740 targetce = m_colEIter_.previous(); 2744 if (targetce == CollationElementIterator.NULLORDER) { 2745 found = false; 2746 break; 2747 } 2748 targetce = getCE(targetce); 2749 if (targetce == CollationElementIterator.IGNORABLE && 2750 m_colEIter_.isInBuffer()) { 2751 continue; 2754 } 2755 if (lastce == CollationElementIterator.NULLORDER 2756 || lastce == CollationElementIterator.IGNORABLE) { 2757 lastce = targetce; 2758 } 2759 if (targetce == m_pattern_.m_CE_[patternceindex]) { 2760 found = true; 2762 break; 2763 } 2764 if (m_colEIter_.m_CEBufferOffset_ <= 0) { 2765 found = false; 2766 break; 2767 } 2768 } 2769 2770 while (found && patternceindex > 0) { 2771 targetce = m_colEIter_.previous(); 2772 if (targetce == CollationElementIterator.NULLORDER) { 2773 found = false; 2774 break; 2775 } 2776 targetce = getCE(targetce); 2777 if (targetce == CollationElementIterator.IGNORABLE) { 2778 continue; 2779 } 2780 2781 patternceindex --; 2782 found = found && targetce == m_pattern_.m_CE_[patternceindex]; 2783 } 2784 2785 if (!found) { 2786 textoffset = 
shiftForward(textoffset, lastce, patternceindex); 2787 patternceindex = m_pattern_.m_CELength_; 2789 continue; 2790 } 2791 2792 if (checkNextExactMatch(textoffset)) { 2793 return; 2795 } 2796 textoffset = m_utilBuffer_[0]; 2797 } 2798 setMatchNotFound(); 2799 } 2800 2801 2806 private void handleNextCanonical(int start) 2807 { 2808 boolean hasPatternAccents = 2809 m_pattern_.m_hasSuffixAccents_ || m_pattern_.m_hasPrefixAccents_; 2810 2811 int textoffset = shiftForward(start, CollationElementIterator.NULLORDER, 2815 m_pattern_.m_CELength_); 2816 m_canonicalPrefixAccents_.delete(0, m_canonicalPrefixAccents_.length()); 2817 m_canonicalSuffixAccents_.delete(0, m_canonicalSuffixAccents_.length()); 2818 int targetce = CollationElementIterator.IGNORABLE; 2819 2820 while (textoffset <= m_textLimitOffset_) 2821 { 2822 m_colEIter_.setExactOffset(textoffset); 2823 int patternceindex = m_pattern_.m_CELength_ - 1; 2824 boolean found = false; 2825 int lastce = CollationElementIterator.NULLORDER; 2826 2827 while (true) { 2828 targetce = m_colEIter_.previous(); 2832 if (targetce == CollationElementIterator.NULLORDER) { 2833 found = false; 2834 break; 2835 } 2836 targetce = getCE(targetce); 2837 if (lastce == CollationElementIterator.NULLORDER 2838 || lastce == CollationElementIterator.IGNORABLE) { 2839 lastce = targetce; 2840 } 2841 if (targetce == m_pattern_.m_CE_[patternceindex]) { 2842 found = true; 2844 break; 2845 } 2846 if (m_colEIter_.m_CEBufferOffset_ <= 0) { 2847 found = false; 2848 break; 2849 } 2850 } 2851 2852 while (found && patternceindex > 0) { 2853 targetce = m_colEIter_.previous(); 2854 if (targetce == CollationElementIterator.NULLORDER) { 2855 found = false; 2856 break; 2857 } 2858 targetce = getCE(targetce); 2859 if (targetce == CollationElementIterator.IGNORABLE) { 2860 continue; 2861 } 2862 2863 patternceindex --; 2864 found = found && targetce == m_pattern_.m_CE_[patternceindex]; 2865 } 2866 2867 if (hasPatternAccents && !found) { 2869 found = 
doNextCanonicalMatch(textoffset); 2870 } 2871 2872 if (!found) { 2873 textoffset = shiftForward(textoffset, lastce, patternceindex); 2874 patternceindex = m_pattern_.m_CELength_; 2876 continue; 2877 } 2878 2879 if (checkNextCanonicalMatch(textoffset)) { 2880 return; 2881 } 2882 textoffset = m_utilBuffer_[0]; 2883 } 2884 setMatchNotFound(); 2885 } 2886 2887 2892 private void handlePreviousExact(int start) 2893 { 2894 int textoffset = reverseShift(start, CollationElementIterator.NULLORDER, 2895 m_pattern_.m_CELength_); 2896 while (textoffset >= m_textBeginOffset_) 2897 { 2898 m_colEIter_.setExactOffset(textoffset); 2899 int patternceindex = 1; 2900 int targetce = CollationElementIterator.IGNORABLE; 2901 boolean found = false; 2902 int firstce = CollationElementIterator.NULLORDER; 2903 2904 while (true) { 2905 targetce = m_colEIter_.next(); 2910 if (targetce == CollationElementIterator.NULLORDER) { 2911 found = false; 2912 break; 2913 } 2914 targetce = getCE(targetce); 2915 if (firstce == CollationElementIterator.NULLORDER 2916 || firstce == CollationElementIterator.IGNORABLE) { 2917 firstce = targetce; 2918 } 2919 if (targetce == CollationElementIterator.IGNORABLE) { 2920 continue; 2921 } 2922 if (targetce == m_pattern_.m_CE_[0]) { 2923 found = true; 2924 break; 2925 } 2926 if (m_colEIter_.m_CEBufferOffset_ == -1 2927 || m_colEIter_.m_CEBufferOffset_ 2928 == m_colEIter_.m_CEBufferSize_) { 2929 found = false; 2931 break; 2932 } 2933 } 2934 2935 targetce = firstce; 2936 2937 while (found && patternceindex < m_pattern_.m_CELength_) { 2938 targetce = m_colEIter_.next(); 2939 if (targetce == CollationElementIterator.NULLORDER) { 2940 found = false; 2941 break; 2942 } 2943 targetce = getCE(targetce); 2944 if (targetce == CollationElementIterator.IGNORABLE) { 2945 continue; 2946 } 2947 2948 found = found && targetce == m_pattern_.m_CE_[patternceindex]; 2949 patternceindex ++; 2950 } 2951 2952 if (!found) { 2953 textoffset = reverseShift(textoffset, targetce, 
patternceindex); 2954 patternceindex = 0; 2955 continue; 2956 } 2957 2958 if (checkPreviousExactMatch(textoffset)) { 2959 return; 2960 } 2961 textoffset = m_utilBuffer_[0]; 2962 } 2963 setMatchNotFound(); 2964 } 2965 2966 2971 private void handlePreviousCanonical(int start) 2972 { 2973 boolean hasPatternAccents = 2974 m_pattern_.m_hasSuffixAccents_ || m_pattern_.m_hasPrefixAccents_; 2975 2976 int textoffset = reverseShift(start, CollationElementIterator.NULLORDER, 2980 m_pattern_.m_CELength_); 2981 m_canonicalPrefixAccents_.delete(0, m_canonicalPrefixAccents_.length()); 2982 m_canonicalSuffixAccents_.delete(0, m_canonicalSuffixAccents_.length()); 2983 2984 while (textoffset >= m_textBeginOffset_) 2985 { 2986 m_colEIter_.setExactOffset(textoffset); 2987 int patternceindex = 1; 2988 int targetce = CollationElementIterator.IGNORABLE; 2989 boolean found = false; 2990 int firstce = CollationElementIterator.NULLORDER; 2991 2992 while (true) { 2993 targetce = m_colEIter_.next(); 2998 if (targetce == CollationElementIterator.NULLORDER) { 2999 found = false; 3000 break; 3001 } 3002 targetce = getCE(targetce); 3003 if (firstce == CollationElementIterator.NULLORDER 3004 || firstce == CollationElementIterator.IGNORABLE) { 3005 firstce = targetce; 3006 } 3007 3008 if (targetce == m_pattern_.m_CE_[0]) { 3009 found = true; 3011 break; 3012 } 3013 if (m_colEIter_.m_CEBufferOffset_ == -1 3014 || m_colEIter_.m_CEBufferOffset_ 3015 == m_colEIter_.m_CEBufferSize_) { 3016 found = false; 3018 break; 3019 } 3020 } 3021 3022 targetce = firstce; 3023 3024 while (found && patternceindex < m_pattern_.m_CELength_) { 3025 targetce = m_colEIter_.next(); 3026 if (targetce == CollationElementIterator.NULLORDER) { 3027 found = false; 3028 break; 3029 } 3030 targetce = getCE(targetce); 3031 if (targetce == CollationElementIterator.IGNORABLE) { 3032 continue; 3033 } 3034 3035 found = found && targetce == m_pattern_.m_CE_[patternceindex]; 3036 patternceindex ++; 3037 } 3038 3039 if (hasPatternAccents 
&& !found) { 3041 found = doPreviousCanonicalMatch(textoffset); 3042 } 3043 3044 if (!found) { 3045 textoffset = reverseShift(textoffset, targetce, patternceindex); 3046 patternceindex = 0; 3047 continue; 3048 } 3049 3050 if (checkPreviousCanonicalMatch(textoffset)) { 3051 return; 3052 } 3053 textoffset = m_utilBuffer_[0]; 3054 } 3055 setMatchNotFound(); 3056 } 3057 3058 3065 private static final String getString(CharacterIterator text, int start, 3066 int length) 3067 { 3068 StringBuffer result = new StringBuffer (length); 3069 int offset = text.getIndex(); 3070 text.setIndex(start); 3071 for (int i = 0; i < length; i ++) { 3072 result.append(text.current()); 3073 text.next(); 3074 } 3075 text.setIndex(offset); 3076 return result.toString(); 3077 } 3078 3079 3084 private static final int getMask(int strength) 3085 { 3086 switch (strength) 3087 { 3088 case Collator.PRIMARY: 3089 return RuleBasedCollator.CE_PRIMARY_MASK_; 3090 case Collator.SECONDARY: 3091 return RuleBasedCollator.CE_SECONDARY_MASK_ 3092 | RuleBasedCollator.CE_PRIMARY_MASK_; 3093 default: 3094 return RuleBasedCollator.CE_TERTIARY_MASK_ 3095 | RuleBasedCollator.CE_SECONDARY_MASK_ 3096 | RuleBasedCollator.CE_PRIMARY_MASK_; 3097 } 3098 } 3099 3100 3103 private void setMatchNotFound() 3104 { 3105 m_matchedIndex_ = DONE; 3107 setMatchLength(0); 3108 } 3109} 3110 | Popular Tags |