package com.thaiopensource.datatype.xsd.regex.jdk1_4;

import com.thaiopensource.util.Utf16;
import com.thaiopensource.util.Localizer;
import com.thaiopensource.datatype.xsd.regex.RegexSyntaxException;

import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Vector;
import java.math.BigDecimal;

/**
 * Translates a regular expression from one regex dialect into another.
 *
 * <p>The input is read with a small recursive-descent parser: branches
 * separated by <code>|</code>, atoms followed by optional quantifiers,
 * bracketed character class expressions (including <code>-[...]</code>
 * subtraction) and backslash escapes such as <code>\d</code>, <code>\i</code>
 * and <code>\p{...}</code>. The translated pattern is accumulated in
 * {@link #result}.
 *
 * <p>Characters outside the Basic Multilingual Plane are emitted as explicit
 * surrogate pairs; see {@link CharClass#output(StringBuffer)}.
 *
 * <p>NOTE(review): judging by the package name and by the constructs that are
 * emitted (property classes, <code>&amp;&amp;</code> intersection), the source
 * dialect is presumably XML Schema regular expressions and the target the
 * java.util.regex syntax of JDK 1.4 - confirm against the callers.
 *
 * <p>The code deliberately sticks to JDK 1.4 language features (raw
 * collections, {@link StringBuffer}).
 */
public class Translator {
  /** The expression being translated. */
  private final String regExp;
  /** Index of the character after {@link #curChar}; keeps growing once the end is reached. */
  private int pos = 0;
  private final int length;
  /** Current lookahead character, or {@link #EOS} once the input is exhausted. */
  private char curChar;
  /** Set when the end of input is reached; distinguishes a real NUL from {@link #EOS}. */
  private boolean eos = false;
  /** Accumulates the translated pattern. */
  private final StringBuffer result = new StringBuffer();

  /** One-letter category codes accepted by <code>\p{X}</code>. */
  static private final String categories = "LMNPZSC";
  /** Lazily filled cache, indexed like {@link #categories}. */
  static private final CharClass[] categoryCharClasses = new CharClass[categories.length()];
  /** Two-letter category codes accepted by <code>\p{Xx}</code>, concatenated. */
  static private final String subCategories = "LuLlLtLmLoMnMcMeNdNlNoPcPdPsPePiPfPoZsZlZpSmScSkSoCcCfCoCn";
  /** Lazily filled cache, one slot per two-letter code in {@link #subCategories}. */
  static private final CharClass[] subCategoryCharClasses = new CharClass[subCategories.length() / 2];

  /** First and last code points outside the Basic Multilingual Plane. */
  static private final int NONBMP_MIN = 0x10000;
  static private final int NONBMP_MAX = 0x10FFFF;
  /** Bounds of the low (second) surrogate range. */
  static private final char SURROGATE2_MIN = '\uDC00';
  static private final char SURROGATE2_MAX = '\uDFFF';

  static final Localizer localizer = new Localizer(Translator.class);

  /**
   * Block names accepted after the <code>Is</code> prefix and translated to a
   * <code>\p{In...}</code> property (see {@link #parseProp()}).
   * "Specials" is listed twice; that is harmless for the linear lookup in
   * {@link #isBlock(String)}.
   */
  static private final String[] blockNames = {
    "BasicLatin",
    "Latin-1Supplement",
    "LatinExtended-A",
    "LatinExtended-B",
    "IPAExtensions",
    "SpacingModifierLetters",
    "CombiningDiacriticalMarks",
    "Greek",
    "Cyrillic",
    "Armenian",
    "Hebrew",
    "Arabic",
    "Syriac",
    "Thaana",
    "Devanagari",
    "Bengali",
    "Gurmukhi",
    "Gujarati",
    "Oriya",
    "Tamil",
    "Telugu",
    "Kannada",
    "Malayalam",
    "Sinhala",
    "Thai",
    "Lao",
    "Tibetan",
    "Myanmar",
    "Georgian",
    "HangulJamo",
    "Ethiopic",
    "Cherokee",
    "UnifiedCanadianAboriginalSyllabics",
    "Ogham",
    "Runic",
    "Khmer",
    "Mongolian",
    "LatinExtendedAdditional",
    "GreekExtended",
    "GeneralPunctuation",
    "SuperscriptsandSubscripts",
    "CurrencySymbols",
    "CombiningMarksforSymbols",
    "LetterlikeSymbols",
    "NumberForms",
    "Arrows",
    "MathematicalOperators",
    "MiscellaneousTechnical",
    "ControlPictures",
    "OpticalCharacterRecognition",
    "EnclosedAlphanumerics",
    "BoxDrawing",
    "BlockElements",
    "GeometricShapes",
    "MiscellaneousSymbols",
    "Dingbats",
    "BraillePatterns",
    "CJKRadicalsSupplement",
    "KangxiRadicals",
    "IdeographicDescriptionCharacters",
    "CJKSymbolsandPunctuation",
    "Hiragana",
    "Katakana",
    "Bopomofo",
    "HangulCompatibilityJamo",
    "Kanbun",
    "BopomofoExtended",
    "EnclosedCJKLettersandMonths",
    "CJKCompatibility",
    "CJKUnifiedIdeographsExtensionA",
    "CJKUnifiedIdeographs",
    "YiSyllables",
    "YiRadicals",
    "HangulSyllables",
    "CJKCompatibilityIdeographs",
    "AlphabeticPresentationForms",
    "ArabicPresentationForms-A",
    "CombiningHalfMarks",
    "CJKCompatibilityForms",
    "SmallFormVariants",
    "ArabicPresentationForms-B",
    "Specials",
    "HalfwidthandFullwidthForms",
    "Specials"
  };

  /**
   * Block names that are not passed through as a property but are mapped to
   * the explicit class at the same index of {@link #specialBlockCharClasses}.
   */
  static private final String[] specialBlockNames = {
    "OldItalic",
    "Gothic",
    "Deseret",
    "ByzantineMusicalSymbols",
    "MusicalSymbols",
    "MathematicalAlphanumericSymbols",
    "CJKUnifiedIdeographsExtensionB",
    "CJKCompatibilityIdeographsSupplement",
    "Tags",
    "PrivateUse",
    "HighSurrogates",
    "HighPrivateUseSurrogates",
    "LowSurrogates",
  };

  /**
   * Explicit code point ranges for {@link #specialBlockNames}; the two arrays
   * must stay index-aligned. The three surrogate blocks map to the empty class.
   */
  static private final CharClass[] specialBlockCharClasses = {
    new CharRange(0x10300, 0x1032F),
    new CharRange(0x10330, 0x1034F),
    new CharRange(0x10400, 0x1044F),
    new CharRange(0x1D000, 0x1D0FF),
    new CharRange(0x1D100, 0x1D1FF),
    new CharRange(0x1D400, 0x1D7FF),
    new CharRange(0x20000, 0x2A6D6),
    new CharRange(0x2F800, 0x2FA1F),
    new CharRange(0xE0000, 0xE007F),
    new Union(new CharClass[] {
      new CharRange(0xE000, 0xF8FF),
      new CharRange(0xF0000, 0xFFFFD),
      new CharRange(0x100000, 0x10FFFD)
    }),
    Empty.getInstance(),
    Empty.getInstance(),
    Empty.getInstance()
  };

  /** Class emitted for '.': everything except line feed and carriage return. */
  static private final CharClass DOT = new Complement(new Union(new CharClass[] { new SingleChar('\n'), new SingleChar('\r') }));

  static private final CharClass ESC_d = new Property("Nd");

  static private final CharClass ESC_D = new Complement(ESC_d);

  /** <code>\W</code> is defined positively (P, Z and C); <code>\w</code> is its complement. */
  static private final CharClass ESC_W = new Union(new CharClass[] {new Property("P"), new Property("Z"), new Property("C")});

  static private final CharClass ESC_w = new Complement(ESC_W);

  static private final CharClass ESC_s = new Union(new CharClass[] {
    new SingleChar(' '),
    new SingleChar('\n'),
    new SingleChar('\r'),
    new SingleChar('\t')
  });

  static private final CharClass ESC_S = new Complement(ESC_s);

  // The \i and \c classes are built from tables in NamingExceptions
  // (presumably the XML name-start / name character sets - confirm there).
  static private final CharClass ESC_i = makeCharClass(NamingExceptions.NMSTRT_CATEGORIES,
                                                       NamingExceptions.NMSTRT_INCLUDES,
                                                       NamingExceptions.NMSTRT_EXCLUDE_RANGES);

  static private final CharClass ESC_I = new Complement(ESC_i);

  static private final CharClass ESC_c = makeCharClass(NamingExceptions.NMCHAR_CATEGORIES,
                                                       NamingExceptions.NMCHAR_INCLUDES,
                                                       NamingExceptions.NMCHAR_EXCLUDE_RANGES);

  static private final CharClass ESC_C = new Complement(ESC_c);

  /** Sentinel stored in {@link #curChar} at end of input; always check {@link #eos} as well. */
  static private final char EOS = '\0';

  private Translator(String regExp) {
    this.regExp = regExp;
    this.length = regExp.length();
    advance();
  }

  /**
   * Translates a regular expression.
   *
   * @param regexp the expression to translate; must not be null
   * @return the translated pattern string
   * @throws RegexSyntaxException if <code>regexp</code> is not syntactically valid;
   *         the exception carries a localized message and the offending position
   */
  static public String translate(String regexp) throws RegexSyntaxException {
    Translator tr = new Translator(regexp);
    tr.translateTop();
    return tr.result.toString();
  }

  /** Moves to the next input character, or to {@link #EOS} when the input is exhausted. */
  private void advance() {
    if (pos < length)
      curChar = regExp.charAt(pos++);
    else {
      // pos keeps advancing so that error positions (pos - 1) stay consistent.
      pos++;
      curChar = EOS;
      eos = true;
    }
  }

  /** Translates a complete expression and insists that all input was consumed. */
  private void translateTop() throws RegexSyntaxException {
    translateRegExp();
    if (!eos)
      throw makeException("expected_eos");
  }

  /** Translates one or more branches separated by '|'. */
  private void translateRegExp() throws RegexSyntaxException {
    translateBranch();
    while (curChar == '|') {
      copyCurChar();
      translateBranch();
    }
  }

  /** Translates a (possibly empty) sequence of atoms, each with an optional quantifier. */
  private void translateBranch() throws RegexSyntaxException {
    while (translateAtom())
      translateQuantifier();
  }

  /** Copies an optional quantifier; does nothing if the current character is not one. */
  private void translateQuantifier() throws RegexSyntaxException {
    switch (curChar) {
    case '*':
    case '?':
    case '+':
      copyCurChar();
      return;
    case '{':
      copyCurChar();
      translateQuantity();
      expect('}');
      copyCurChar();
    }
  }

  /**
   * Translates the inside of a <code>{n}</code>, <code>{n,}</code> or
   * <code>{n,m}</code> quantifier. Bounds too large for an <code>int</code>
   * are replaced by {@link Integer#MAX_VALUE} in the output.
   */
  private void translateQuantity() throws RegexSyntaxException {
    String lower = parseQuantExact();
    // Stays negative when the lower bound does not fit in an int.
    int lowerValue = -1;
    try {
      lowerValue = Integer.parseInt(lower);
      result.append(lower);
    }
    catch (NumberFormatException e) {
      // Bound does not fit in an int: clamp what is emitted.
      result.append(Integer.MAX_VALUE);
    }
    if (curChar == ',') {
      copyCurChar();
      if (curChar != '}') {
        String upper = parseQuantExact();
        try {
          int upperValue = Integer.parseInt(upper);
          result.append(upper);
          // An oversized lower bound is necessarily larger than an int-sized upper bound.
          if (lowerValue < 0 || upperValue < lowerValue)
            throw makeException("invalid_quantity_range");
        }
        catch (NumberFormatException e) {
          result.append(Integer.MAX_VALUE);
          // Both bounds are oversized: compare them with arbitrary precision.
          if (lowerValue < 0 && new BigDecimal(lower).compareTo(new BigDecimal(upper)) > 0)
            throw makeException("invalid_quantity_range");
        }
      }
    }
  }

  /** Reads one or more ASCII digits, up to (not including) the next ',' or '}'. */
  private String parseQuantExact() throws RegexSyntaxException {
    StringBuffer buf = new StringBuffer();
    do {
      if ("0123456789".indexOf(curChar) < 0)
        throw makeException("expected_digit");
      buf.append(curChar);
      advance();
    } while (curChar != ',' && curChar != '}');
    return buf.toString();
  }

  /** Appends the current character to the output unchanged and advances. */
  private void copyCurChar() {
    result.append(curChar);
    advance();
  }

  // Three-valued "how much of a plane does a class cover" flag. The values are
  // chosen so that negation yields the flag of the complement and min/max
  // yield the flags used for subtraction/union.
  static final int NONE = -1;
  static final int SOME = 0;
  static final int ALL = 1;

  /** Class matching any high (first) surrogate. */
  static final String SURROGATES1_CLASS = "[\uD800-\uDBFF]";
  /** Class matching any low (second) surrogate. */
  static final String SURROGATES2_CLASS = "[\uDC00-\uDFFF]";
  /** Class that can never match: NUL intersected with not-NUL. */
  static final String NOT_ALLOWED_CLASS = "[\u0000&&[^\u0000]]";

  /**
   * Inclusive range of code points. Ordered by ascending minimum; for equal
   * minimums the range with the larger maximum sorts first.
   */
  static final class Range implements Comparable {
    private final int min;
    private final int max;

    Range(int min, int max) {
      this.min = min;
      this.max = max;
    }

    int getMin() {
      return min;
    }

    int getMax() {
      return max;
    }

    public int compareTo(Object o) {
      Range other = (Range)o;
      if (this.min < other.min)
        return -1;
      if (this.min > other.min)
        return 1;
      if (this.max > other.max)
        return -1;
      if (this.max < other.max)
        return 1;
      return 0;
    }
  }

  /**
   * A set of code points that knows how to write itself as a pattern fragment.
   * The BMP part and the non-BMP part are tracked and emitted separately.
   */
  static abstract class CharClass {

    /** One of {@link #NONE}, {@link #SOME}, {@link #ALL} for the BMP part. */
    private final int containsBmp;
    /** One of {@link #NONE}, {@link #SOME}, {@link #ALL} for the non-BMP part. */
    private final int containsNonBmp;

    protected CharClass(int containsBmp, int containsNonBmp) {
      this.containsBmp = containsBmp;
      this.containsNonBmp = containsNonBmp;
    }

    int getContainsBmp() {
      return containsBmp;
    }

    int getContainsNonBmp() {
      return containsNonBmp;
    }

    /**
     * Appends a pattern fragment matching this class. Non-BMP code points are
     * written as alternatives over explicit surrogate pairs inside a group.
     */
    final void output(StringBuffer buf) {
      switch (containsNonBmp) {
      case NONE:
        if (containsBmp == NONE)
          buf.append(NOT_ALLOWED_CLASS);
        else
          outputBmp(buf);
        break;
      case ALL:
        buf.append('(');
        if (containsBmp == NONE) {
          buf.append(SURROGATES1_CLASS);
          buf.append(SURROGATES2_CLASS);
        }
        else {
          // Emits the BMP class followed by an optional low surrogate.
          outputBmp(buf);
          buf.append(SURROGATES2_CLASS);
          buf.append('?');
        }
        buf.append(')');
        break;
      case SOME:
        buf.append('(');
        boolean needSep = false;
        if (containsBmp != NONE) {
          needSep = true;
          outputBmp(buf);
        }
        List ranges = new Vector();
        addNonBmpRanges(ranges);
        sortRangeList(ranges);
        // High surrogates whose whole low-surrogate range is included.
        String hi = highSurrogateRanges(ranges);
        if (hi.length() > 0) {
          if (needSep)
            buf.append('|');
          else
            needSep = true;
          buf.append('[');
          for (int i = 0, len = hi.length(); i < len; i += 2) {
            char min = hi.charAt(i);
            char max = hi.charAt(i + 1);
            if (min == max)
              buf.append(min);
            else {
              buf.append(min);
              buf.append('-');
              buf.append(max);
            }
          }
          buf.append(']');
          buf.append(SURROGATES2_CLASS);
        }
        // Triples (high, lowMin, lowMax) for partially covered high surrogates.
        String lo = lowSurrogateRanges(ranges);
        for (int i = 0, len = lo.length(); i < len; i += 3) {
          if (needSep)
            buf.append('|');
          else
            needSep = true;
          buf.append(lo.charAt(i));
          char min = lo.charAt(i + 1);
          char max = lo.charAt(i + 2);
          if (min == max && (i + 3 >= len || lo.charAt(i + 3) != lo.charAt(i)))
            buf.append(min);
          else {
            buf.append('[');
            // Merge all consecutive triples sharing the same high surrogate.
            for (;;) {
              if (min == max)
                buf.append(min);
              else {
                buf.append(min);
                buf.append('-');
                buf.append(max);
              }
              if (i + 3 >= len || lo.charAt(i + 3) != lo.charAt(i))
                break;
              i += 3;
              min = lo.charAt(i + 1);
              max = lo.charAt(i + 2);
            }
            buf.append(']');
          }
        }
        if (!needSep)
          buf.append(NOT_ALLOWED_CLASS);
        buf.append(')');
        break;
      }
    }

    /**
     * Returns pairs of chars (min, max) giving the ranges of high surrogates
     * for which every low surrogate is covered by <code>ranges</code>.
     */
    static String highSurrogateRanges(List ranges) {
      StringBuffer highRanges = new StringBuffer();
      for (int i = 0, len = ranges.size(); i < len; i++) {
        Range r = (Range)ranges.get(i);
        char min1 = Utf16.surrogate1(r.getMin());
        char min2 = Utf16.surrogate2(r.getMin());
        char max1 = Utf16.surrogate1(r.getMax());
        char max2 = Utf16.surrogate2(r.getMax());
        // A partially covered first/last high surrogate is handled by lowSurrogateRanges.
        if (min2 != SURROGATE2_MIN)
          min1++;
        if (max2 != SURROGATE2_MAX)
          max1--;
        if (max1 >= min1) {
          highRanges.append(min1);
          highRanges.append(max1);
        }
      }
      return highRanges.toString();
    }

    /**
     * Returns triples of chars (high, lowMin, lowMax) for the high surrogates
     * whose low-surrogate range is only partially covered by <code>ranges</code>.
     */
    static String lowSurrogateRanges(List ranges) {
      StringBuffer lowRanges = new StringBuffer();
      for (int i = 0, len = ranges.size(); i < len; i++) {
        Range r = (Range)ranges.get(i);
        char min1 = Utf16.surrogate1(r.getMin());
        char min2 = Utf16.surrogate2(r.getMin());
        char max1 = Utf16.surrogate1(r.getMax());
        char max2 = Utf16.surrogate2(r.getMax());
        if (min1 == max1) {
          if (min2 != SURROGATE2_MIN || max2 != SURROGATE2_MAX) {
            lowRanges.append(min1);
            lowRanges.append(min2);
            lowRanges.append(max2);
          }
        }
        else {
          if (min2 != SURROGATE2_MIN) {
            lowRanges.append(min1);
            lowRanges.append(min2);
            lowRanges.append(SURROGATE2_MAX);
          }
          if (max2 != SURROGATE2_MAX) {
            lowRanges.append(max1);
            lowRanges.append(SURROGATE2_MIN);
            lowRanges.append(max2);
          }
        }
      }
      return lowRanges.toString();
    }

    /** Appends a fragment matching the BMP part of this class. */
    abstract void outputBmp(StringBuffer buf);
    /** Appends a fragment matching the BMP characters not in this class. */
    abstract void outputComplementBmp(StringBuffer buf);

    /** Returns the code point if this class is a single character, otherwise -1. */
    int singleChar() {
      return -1;
    }

    /** Adds the non-BMP part of this class to <code>ranges</code> as {@link Range}s. */
    void addNonBmpRanges(List ranges) {
    }


    /** Sorts <code>ranges</code> in place and merges overlapping or adjacent ranges. */
    static void sortRangeList(List ranges) {
      Collections.sort(ranges);
      int toIndex = 0;
      int fromIndex = 0;
      int len = ranges.size();
      while (fromIndex < len) {
        Range r = (Range)ranges.get(fromIndex);
        int min = r.getMin();
        int max = r.getMax();
        while (++fromIndex < len) {
          Range r2 = (Range)ranges.get(fromIndex);
          if (r2.getMin() > max + 1)
            break;
          if (r2.getMax() > max)
            max = r2.getMax();
        }
        if (max != r.getMax())
          r = new Range(min, max);
        ranges.set(toIndex++, r);
      }
      // Drop the now unused tail.
      while (len > toIndex)
        ranges.remove(--len);
    }

  }

  /** A class whose BMP part can be written directly inside a bracket expression. */
  static abstract class SimpleCharClass extends CharClass {
    SimpleCharClass(int containsBmp, int containsNonBmp) {
      super(containsBmp, containsNonBmp);
    }

    void outputBmp(StringBuffer buf) {
      buf.append('[');
      inClassOutputBmp(buf);
      buf.append(']');
    }

    void outputComplementBmp(StringBuffer buf) {
      // With no BMP members the complement is the whole BMP; "[^]" would be invalid.
      if (getContainsBmp() == NONE)
        buf.append("[\u0000-\uFFFF]");
      else {
        buf.append("[^");
        inClassOutputBmp(buf);
        buf.append(']');
      }
    }
    /** Appends the BMP members without the surrounding brackets. */
    abstract void inClassOutputBmp(StringBuffer buf);
  }

  /** A single BMP character. */
  static class SingleChar extends SimpleCharClass {
    private final char c;
    SingleChar(char c) {
      super(SOME, NONE);
      this.c = c;
    }

    int singleChar() {
      return c;
    }

    void outputBmp(StringBuffer buf) {
      inClassOutputBmp(buf);
    }

    void inClassOutputBmp(StringBuffer buf) {
      if (isJavaMetaChar(c))
        buf.append('\\');
      buf.append(c);
    }

  }

  /** A single non-BMP code point. */
  static class WideSingleChar extends SimpleCharClass {
    private final int c;

    WideSingleChar(int c) {
      super(NONE, SOME);
      this.c = c;
    }

    void inClassOutputBmp(StringBuffer buf) {
      throw new RuntimeException("BMP output botch");
    }

    int singleChar() {
      return c;
    }

    void addNonBmpRanges(List ranges) {
      ranges.add(new Range(c, c));
    }
  }

  /** The class containing no characters at all (singleton). */
  static class Empty extends SimpleCharClass {
    static private final Empty instance = new Empty();
    private Empty() {
      super(NONE, NONE);
    }

    static Empty getInstance() {
      return instance;
    }

    void inClassOutputBmp(StringBuffer buf) {
      throw new RuntimeException("BMP output botch");
    }

  }

  /** An inclusive range of code points, which may straddle the BMP boundary. */
  static class CharRange extends SimpleCharClass {
    private final int lower;
    private final int upper;

    CharRange(int lower, int upper) {
      // The non-BMP flag is only ever SOME or NONE here, never ALL, even
      // when the range spans every non-BMP code point.
      super(lower < NONBMP_MIN ? SOME : NONE,
            upper >= NONBMP_MIN ? SOME : NONE);
      this.lower = lower;
      this.upper = upper;
    }

    void inClassOutputBmp(StringBuffer buf) {
      if (lower >= NONBMP_MIN)
        throw new RuntimeException("BMP output botch");
      if (isJavaMetaChar((char)lower))
        buf.append('\\');
      buf.append((char)lower);
      buf.append('-');
      if (upper < NONBMP_MIN) {
        if (isJavaMetaChar((char)upper))
          buf.append('\\');
        buf.append((char)upper);
      }
      else
        // Clip the BMP part of a straddling range at the last BMP code point.
        buf.append('\uFFFF');
    }

    void addNonBmpRanges(List ranges) {
      if (upper >= NONBMP_MIN)
        ranges.add(new Range(lower < NONBMP_MIN ? NONBMP_MIN : lower, upper));
    }
  }

  /** A named property, emitted as <code>\p{name}</code> (or <code>\P{name}</code> when complemented). */
  static class Property extends SimpleCharClass {
    private final String name;

    Property(String name) {
      super(SOME, NONE);
      this.name = name;
    }

    void outputBmp(StringBuffer buf) {
      inClassOutputBmp(buf);
    }

    void inClassOutputBmp(StringBuffer buf) {
      buf.append("\\p{");
      buf.append(name);
      buf.append('}');
    }

    void outputComplementBmp(StringBuffer buf) {
      buf.append("\\P{");
      buf.append(name);
      buf.append('}');
    }
  }

  /** The characters of <code>cc1</code> that are not in <code>cc2</code>. */
  static class Subtraction extends CharClass {
    private final CharClass cc1;
    private final CharClass cc2;
    Subtraction(CharClass cc1, CharClass cc2) {
      // min(a, -b) combines the three-valued flags: the result can contain
      // no more than cc1 does and no more than the complement of cc2 does.
      super(Math.min(cc1.getContainsBmp(), -cc2.getContainsBmp()),
            Math.min(cc1.getContainsNonBmp(), -cc2.getContainsNonBmp()));
      this.cc1 = cc1;
      this.cc2 = cc2;
    }

    void outputBmp(StringBuffer buf) {
      // cc1 intersected with the complement of cc2.
      buf.append('[');
      cc1.outputBmp(buf);
      buf.append("&&");
      cc2.outputComplementBmp(buf);
      buf.append(']');
    }

    void outputComplementBmp(StringBuffer buf) {
      // Union of the complement of cc1 with cc2.
      buf.append('[');
      cc1.outputComplementBmp(buf);
      cc2.outputBmp(buf);
      buf.append(']');
    }

    void addNonBmpRanges(List ranges) {
      List posList = new Vector();
      cc1.addNonBmpRanges(posList);
      List negList = new Vector();
      cc2.addNonBmpRanges(negList);
      sortRangeList(posList);
      sortRangeList(negList);
      // Sweep both sorted, merged lists in parallel, cutting the negative
      // ranges out of each positive range.
      Iterator negIter = negList.iterator();
      Range negRange;
      if (negIter.hasNext())
        negRange = (Range)negIter.next();
      else
        negRange = null;
      for (int i = 0, len = posList.size(); i < len; i++) {
        Range posRange = (Range)posList.get(i);
        // Skip negative ranges that end before this positive range starts.
        while (negRange != null && negRange.getMax() < posRange.getMin()) {
          if (negIter.hasNext())
            negRange = (Range)negIter.next();
          else
            negRange = null;
        }
        // min is the first code point of posRange not yet emitted or excluded.
        int min = posRange.getMin();
        while (negRange != null && negRange.getMin() <= posRange.getMax()) {
          if (min < negRange.getMin()) {
            ranges.add(new Range(min, negRange.getMin() - 1));
          }
          min = negRange.getMax() + 1;
          // negRange may still overlap the next positive range, so keep it.
          if (min > posRange.getMax())
            break;
          if (negIter.hasNext())
            negRange = (Range)negIter.next();
          else
            negRange = null;
        }
        if (min <= posRange.getMax())
          ranges.add(new Range(min, posRange.getMax()));
      }
    }
  }

  /** The union of a list of classes. */
  static class Union extends CharClass {
    private final List members;

    Union(CharClass[] v) {
      this(toList(v));
    }

    static private List toList(CharClass[] v) {
      List members = new Vector();
      for (int i = 0; i < v.length; i++)
        members.add(v[i]);
      return members;
    }

    /** Keeps a reference to <code>members</code> (no copy is made). */
    Union(List members) {
      super(computeContainsBmp(members), computeContainsNonBmp(members));
      this.members = members;
    }

    void outputBmp(StringBuffer buf) {
      buf.append('[');
      for (int i = 0, len = members.size(); i < len; i++) {
        CharClass cc = (CharClass)members.get(i);
        if (cc.getContainsBmp() != NONE) {
          if (cc instanceof SimpleCharClass)
            ((SimpleCharClass)cc).inClassOutputBmp(buf);
          else
            cc.outputBmp(buf);
        }
      }
      buf.append(']');
    }

    void outputComplementBmp(StringBuffer buf) {
      boolean first = true;
      int len = members.size();
      // Simple members first: they share one negated bracket expression.
      for (int i = 0; i < len; i++) {
        CharClass cc = (CharClass)members.get(i);
        if (cc.getContainsBmp() != NONE && cc instanceof SimpleCharClass) {
          if (first) {
            buf.append("[^");
            first = false;
          }
          ((SimpleCharClass)cc).inClassOutputBmp(buf);
        }
      }
      // Remaining members: intersect with each member's own complement.
      for (int i = 0; i < len; i++) {
        CharClass cc = (CharClass)members.get(i);
        if (cc.getContainsBmp() != NONE && !(cc instanceof SimpleCharClass)) {
          if (first) {
            buf.append('[');
            first = false;
          }
          else
            buf.append("&&");
          cc.outputComplementBmp(buf);
        }
      }
      if (first)
        // No member has a BMP part, so the complement is the whole BMP.
        buf.append("[\u0000-\uFFFF]");
      else
        buf.append(']');
    }

    void addNonBmpRanges(List ranges) {
      for (int i = 0, len = members.size(); i < len; i++)
        ((CharClass)members.get(i)).addNonBmpRanges(ranges);
    }

    private static int computeContainsBmp(List members) {
      int ret = NONE;
      for (int i = 0, len = members.size(); i < len; i++)
        ret = Math.max(ret, ((CharClass)members.get(i)).getContainsBmp());
      return ret;
    }

    private static int computeContainsNonBmp(List members) {
      int ret = NONE;
      for (int i = 0, len = members.size(); i < len; i++)
        ret = Math.max(ret, ((CharClass)members.get(i)).getContainsNonBmp());
      return ret;
    }
  }

  /** Every code point not in the wrapped class. */
  static class Complement extends CharClass {
    private final CharClass cc;
    Complement(CharClass cc) {
      // Negation maps NONE to ALL, ALL to NONE and leaves SOME unchanged.
      super(-cc.getContainsBmp(), -cc.getContainsNonBmp());
      this.cc = cc;
    }

    void outputBmp(StringBuffer buf) {
      cc.outputComplementBmp(buf);
    }

    void outputComplementBmp(StringBuffer buf) {
      cc.outputBmp(buf);
    }

    void addNonBmpRanges(List ranges) {
      List tem = new Vector();
      cc.addNonBmpRanges(tem);
      sortRangeList(tem);
      // c is the first non-BMP code point not yet accounted for.
      int c = NONBMP_MIN;
      for (int i = 0, len = tem.size(); i < len; i++) {
        Range r = (Range)tem.get(i);
        if (r.getMin() > c)
          ranges.add(new Range(c, r.getMin() - 1));
        c = r.getMax() + 1;
      }
      if (c != NONBMP_MAX + 1)
        ranges.add(new Range(c, NONBMP_MAX));
    }
  }

  /**
   * Translates one atom if there is one.
   *
   * @return false, consuming nothing, if the current character cannot start
   *         an atom; true after an atom has been translated
   */
  private boolean translateAtom() throws RegexSyntaxException {
    switch (curChar) {
    case EOS:
      // A NUL that is part of the input is an ordinary character.
      if (!eos)
        break;
      // else fall through: real end of input
    case '?':
    case '*':
    case '+':
    case ')':
    case '{':
    case '}':
    case '|':
    case ']':
      return false;
    case '(':
      copyCurChar();
      translateRegExp();
      expect(')');
      copyCurChar();
      return true;
    case '\\':
      advance();
      parseEsc().output(result);
      return true;
    case '[':
      advance();
      parseCharClassExpr().output(result);
      return true;
    case '.':
      DOT.output(result);
      advance();
      return true;
    case '$':
    case '^':
      // Treated as literals in the input, so they are escaped in the output.
      result.append('\\');
      break;
    }
    copyCurChar();
    return true;
  }


  /**
   * Builds (categories + includes) minus excludeRanges.
   *
   * @param categories concatenated two-letter category names
   * @param includes individual characters to add; runs of consecutive
   *        characters are collapsed into ranges
   * @param excludeRanges pairs of characters (min, max) to subtract
   */
  static private CharClass makeCharClass(String categories, String includes, String excludeRanges) {
    List includeList = new Vector();
    for (int i = 0, len = categories.length(); i < len; i += 2)
      includeList.add(new Property(categories.substring(i, i + 2)));
    for (int i = 0, len = includes.length(); i < len; i++) {
      // Find the end j of the run of consecutive code points starting at i.
      int j = i + 1;
      for (; j < len && includes.charAt(j) - includes.charAt(i) == j - i; j++)
        ;
      --j;
      // A run of exactly two is emitted as two single characters, not a range.
      if (i == j - 1)
        --j;
      if (i == j)
        includeList.add(new SingleChar(includes.charAt(i)));
      else
        includeList.add(new CharRange(includes.charAt(i), includes.charAt(j)));
      i = j;
    }
    List excludeList = new Vector();
    for (int i = 0, len = excludeRanges.length(); i < len; i += 2) {
      char min = excludeRanges.charAt(i);
      char max = excludeRanges.charAt(i + 1);
      if (min == max)
        excludeList.add(new SingleChar(min));
      else if (min == max - 1) {
        excludeList.add(new SingleChar(min));
        excludeList.add(new SingleChar(max));
      }
      else
        excludeList.add(new CharRange(min, max));
    }
    return new Subtraction(new Union(includeList), new Union(excludeList));
  }

  /**
   * Parses the escape that follows a backslash; on entry the current
   * character is the one after the backslash.
   */
  private CharClass parseEsc() throws RegexSyntaxException {
    switch (curChar) {
    case 'n':
      advance();
      return new SingleChar('\n');
    case 'r':
      advance();
      return new SingleChar('\r');
    case 't':
      advance();
      return new SingleChar('\t');
    case '\\':
    case '|':
    case '.':
    case '-':
    case '^':
    case '?':
    case '*':
    case '+':
    case '(':
    case ')':
    case '{':
    case '}':
    case '[':
    case ']':
      // Escaped metacharacter: handled as a literal after the switch.
      break;
    case 's':
      advance();
      return ESC_s;
    case 'S':
      advance();
      return ESC_S;
    case 'i':
      advance();
      return ESC_i;
    case 'I':
      advance();
      return ESC_I;
    case 'c':
      advance();
      return ESC_c;
    case 'C':
      advance();
      return ESC_C;
    case 'd':
      advance();
      return ESC_d;
    case 'D':
      advance();
      return ESC_D;
    case 'w':
      advance();
      return ESC_w;
    case 'W':
      advance();
      return ESC_W;
    case 'p':
      advance();
      return parseProp();
    case 'P':
      advance();
      return new Complement(parseProp());
    default:
      throw makeException("bad_escape");
    }
    CharClass tem = new SingleChar(curChar);
    advance();
    return tem;
  }

  /**
   * Parses the <code>{name}</code> part of a property escape. One-letter and
   * two-letter names are categories; longer names must be <code>Is</code>
   * followed by a known block name.
   */
  private CharClass parseProp() throws RegexSyntaxException {
    expect('{');
    int start = pos;
    for (;;) {
      advance();
      if (curChar == '}')
        break;
      if (!isAsciiAlnum(curChar) && curChar != '-')
        // Anything else is an error; expect() raises it.
        expect('}');
    }
    String propertyName = regExp.substring(start, pos - 1);
    advance();
    switch (propertyName.length()) {
    case 0:
      throw makeException("empty_property_name");
    case 2:
      int sci = subCategories.indexOf(propertyName);
      // An odd index means the match straddles two adjacent codes.
      if (sci < 0 || sci % 2 == 1)
        throw makeException("bad_category", propertyName);
      return getSubCategoryCharClass(sci / 2);
    case 1:
      int ci = categories.indexOf(propertyName.charAt(0));
      if (ci < 0)
        throw makeException("bad_category", propertyName);
      return getCategoryCharClass(ci);
    default:
      if (!propertyName.startsWith("Is"))
        break;
      String blockName = propertyName.substring(2);
      for (int i = 0; i < specialBlockNames.length; i++)
        if (blockName.equals(specialBlockNames[i]))
          return specialBlockCharClasses[i];
      if (!isBlock(blockName))
        throw makeException("bad_block_name", blockName);
      return new Property("In" + blockName);
    }
    throw makeException("bad_property_name", propertyName);
  }

  static private boolean isBlock(String name) {
    for (int i = 0; i < blockNames.length; i++)
      if (name.equals(blockNames[i]))
        return true;
    return false;
  }

  static private boolean isAsciiAlnum(char c) {
    if ('a' <= c && c <= 'z')
      return true;
    if ('A' <= c && c <= 'Z')
      return true;
    if ('0' <= c && c <= '9')
      return true;
    return false;
  }

  /** Throws unless the current character is <code>c</code>; does not consume it. */
  private void expect(char c) throws RegexSyntaxException {
    if (curChar != c)
      throw makeException("expected", String.valueOf(c));
  }

  /**
   * Parses a character class expression; on entry the current character is
   * the one after the opening '['. Consumes the closing ']'.
   */
  private CharClass parseCharClassExpr() throws RegexSyntaxException {
    boolean compl;
    if (curChar == '^') {
      advance();
      compl = true;
    }
    else
      compl = false;
    List members = new Vector();
    do {
      CharClass lower = parseCharClassEscOrXmlChar();
      members.add(lower);
      if (curChar == '-') {
        advance();
        // "-[" starts a subtraction rather than a range.
        if (curChar == '[')
          break;
        CharClass upper = parseCharClassEscOrXmlChar();
        if (lower.singleChar() < 0 || upper.singleChar() < 0)
          throw makeException("multi_range");
        if (lower.singleChar() > upper.singleChar())
          throw makeException("invalid_range");
        // Replace the lower bound just added with the complete range.
        members.set(members.size() - 1,
                    new CharRange(lower.singleChar(), upper.singleChar()));
        if (curChar == '-') {
          advance();
          expect('[');
          break;
        }
      }
    } while (curChar != ']');
    // Named charClass so that it does not shadow the output buffer field.
    CharClass charClass;
    if (members.size() == 1)
      charClass = (CharClass)members.get(0);
    else
      charClass = new Union(members);
    if (compl)
      charClass = new Complement(charClass);
    if (curChar == '[') {
      advance();
      charClass = new Subtraction(charClass, parseCharClassExpr());
      expect(']');
    }
    advance();
    return charClass;
  }

  /** Parses one member of a bracket expression: an escape or a literal character. */
  private CharClass parseCharClassEscOrXmlChar() throws RegexSyntaxException {
    switch (curChar) {
    case EOS:
      // At real end of input this throws; an embedded NUL is a literal.
      if (eos)
        expect(']');
      break;
    case '\\':
      advance();
      return parseEsc();
    case '[':
    case ']':
    case '-':
      throw makeException("should_quote", String.valueOf(curChar));
    }
    CharClass tem;
    if (Utf16.isSurrogate(curChar)) {
      // Combine a surrogate pair in the input into a single code point.
      if (!Utf16.isSurrogate1(curChar))
        throw makeException("invalid_surrogate");
      char c1 = curChar;
      advance();
      if (!Utf16.isSurrogate2(curChar))
        throw makeException("invalid_surrogate");
      tem = new WideSingleChar(Utf16.scalarValue(c1, curChar));
    }
    else
      tem = new SingleChar(curChar);
    advance();
    return tem;
  }

  /** Creates an exception for the message <code>key</code>, positioned at the current character. */
  private RegexSyntaxException makeException(String key) {
    return new RegexSyntaxException(localizer.message(key), pos - 1);
  }

  private RegexSyntaxException makeException(String key, String arg) {
    return new RegexSyntaxException(localizer.message(key, arg), pos - 1);
  }

  /** Returns true for characters that are backslash-escaped when written to the output. */
  static private boolean isJavaMetaChar(char c) {
    switch (c) {
    case '\\':
    case '^':
    case '?':
    case '*':
    case '+':
    case '(':
    case ')':
    case '{':
    case '}':
    case '|':
    case '[':
    case ']':
    case '-':
    case '&':
    case '$':
    case '.':
      return true;
    }
    return false;
  }

  /** Returns the class for the one-letter category at index <code>ci</code>, computing it on first use. */
  static private synchronized CharClass getCategoryCharClass(int ci) {
    if (categoryCharClasses[ci] == null)
      categoryCharClasses[ci] = computeCategoryCharClass(categories.charAt(ci));
    return categoryCharClasses[ci];
  }

  /** Returns the class for the two-letter category at index <code>sci</code>, computing it on first use. */
  static private synchronized CharClass getSubCategoryCharClass(int sci) {
    if (subCategoryCharClasses[sci] == null)
      subCategoryCharClasses[sci] = computeSubCategoryCharClass(subCategories.substring(sci * 2, (sci + 1) * 2));
    return subCategoryCharClasses[sci];
  }

  // Adjustments applied on top of the engine's own \p{...} tables. Going by
  // the constant names these track Unicode 3.1 changes - TODO confirm.
  static private final char UNICODE_3_1_ADD_Lu = '\u03F4';
  static private final char UNICODE_3_1_ADD_Ll = '\u03F5';
  static private final char UNICODE_3_1_CHANGE_No_to_Nl_MIN = '\u16EE';
  static private final char UNICODE_3_1_CHANGE_No_to_Nl_MAX = '\u16F0';
  /** Members used for the Pi and Pf categories, which are built explicitly instead of via a property. */
  static private final String CATEGORY_Pi = "\u00AB\u2018\u201B\u201C\u201F\u2039";
  static private final String CATEGORY_Pf = "\u00BB\u2019\u201D\u203A";

  /**
   * Builds the class for a one-letter category: the engine's property plus
   * the extra ranges listed in Categories for every subcategory whose name
   * contains <code>code</code>, plus the adjustments above.
   */
  static private CharClass computeCategoryCharClass(char code) {
    List classes = new Vector();
    classes.add(new Property(String.valueOf(code)));
    // CATEGORY_NAMES is indexed in steps of two characters per entry
    // (presumably concatenated two-letter names - see Categories).
    for (int ci = Categories.CATEGORY_NAMES.indexOf(code); ci >= 0; ci = Categories.CATEGORY_NAMES.indexOf(code, ci + 1)) {
      int[] addRanges = Categories.CATEGORY_RANGES[ci/2];
      for (int i = 0; i < addRanges.length; i += 2)
        classes.add(new CharRange(addRanges[i], addRanges[i + 1]));
    }
    if (code == 'P')
      classes.add(makeCharClass(CATEGORY_Pi + CATEGORY_Pf));
    if (code == 'L') {
      classes.add(new SingleChar(UNICODE_3_1_ADD_Ll));
      classes.add(new SingleChar(UNICODE_3_1_ADD_Lu));
    }
    if (code == 'C') {
      // Unassigned BMP characters, minus the two that are added to L above.
      classes.add(new Subtraction(new Property("Cn"),
                                  new Union(new CharClass[] { new SingleChar(UNICODE_3_1_ADD_Lu),
                                                              new SingleChar(UNICODE_3_1_ADD_Ll) })));
      // Non-BMP code points not covered by any range in Categories.
      List assignedRanges = new Vector();
      for (int i = 0; i < Categories.CATEGORY_RANGES.length; i++)
        for (int j = 0; j < Categories.CATEGORY_RANGES[i].length; j += 2)
          assignedRanges.add(new CharRange(Categories.CATEGORY_RANGES[i][j],
                                           Categories.CATEGORY_RANGES[i][j + 1]));
      classes.add(new Subtraction(new CharRange(NONBMP_MIN, NONBMP_MAX),
                                  new Union(assignedRanges)));
    }
    if (classes.size() == 1)
      return (CharClass)classes.get(0);
    return new Union(classes);
  }

  /** Builds the class for a two-letter category name such as "Lu" or "Cn". */
  static private CharClass computeSubCategoryCharClass(String name) {
    CharClass base = new Property(name);
    int sci = Categories.CATEGORY_NAMES.indexOf(name);
    if (sci < 0) {
      if (name.equals("Cn")) {
        // Unassigned = (property Cn + all non-BMP) minus everything that
        // Categories or the adjustments above list as assigned.
        List assignedRanges = new Vector();
        assignedRanges.add(new SingleChar(UNICODE_3_1_ADD_Lu));
        assignedRanges.add(new SingleChar(UNICODE_3_1_ADD_Ll));
        for (int i = 0; i < Categories.CATEGORY_RANGES.length; i++)
          for (int j = 0; j < Categories.CATEGORY_RANGES[i].length; j += 2)
            assignedRanges.add(new CharRange(Categories.CATEGORY_RANGES[i][j],
                                             Categories.CATEGORY_RANGES[i][j + 1]));
        return new Subtraction(new Union(new CharClass[] { base, new CharRange(NONBMP_MIN, NONBMP_MAX) }),
                               new Union(assignedRanges));
      }
      if (name.equals("Pi"))
        return makeCharClass(CATEGORY_Pi);
      if (name.equals("Pf"))
        return makeCharClass(CATEGORY_Pf);
      return base;
    }
    List classes = new Vector();
    classes.add(base);
    int[] addRanges = Categories.CATEGORY_RANGES[sci/2];
    for (int i = 0; i < addRanges.length; i += 2)
      classes.add(new CharRange(addRanges[i], addRanges[i + 1]));
    if (name.equals("Lu"))
      classes.add(new SingleChar(UNICODE_3_1_ADD_Lu));
    else if (name.equals("Ll"))
      classes.add(new SingleChar(UNICODE_3_1_ADD_Ll));
    else if (name.equals("Nl"))
      classes.add(new CharRange(UNICODE_3_1_CHANGE_No_to_Nl_MIN, UNICODE_3_1_CHANGE_No_to_Nl_MAX));
    else if (name.equals("No"))
      return new Subtraction(new Union(classes),
                             new CharRange(UNICODE_3_1_CHANGE_No_to_Nl_MIN,
                                           UNICODE_3_1_CHANGE_No_to_Nl_MAX));
    return new Union(classes);
  }

  /** Builds the union of the individual characters of <code>members</code>. */
  private static CharClass makeCharClass(String members) {
    List list = new Vector();
    for (int i = 0, len = members.length(); i < len; i++)
      list.add(new SingleChar(members.charAt(i)));
    return new Union(list);
  }

  /**
   * Debugging entry point: translates <code>args[0]</code> and prints the
   * result to standard error, with every character outside printable ASCII
   * written as a four-digit hexadecimal escape.
   */
  public static void main(String[] args) throws RegexSyntaxException {
    if (args.length < 1) {
      System.err.println("usage: java " + Translator.class.getName() + " regexp");
      return;
    }
    String s = translate(args[0]);
    for (int i = 0, len = s.length(); i < len; i++) {
      char c = s.charAt(i);
      if (c >= 0x20 && c <= 0x7e)
        System.err.print(c);
      else {
        System.err.print("\\u");
        for (int shift = 12; shift >= 0; shift -= 4)
          System.err.print("0123456789ABCDEF".charAt((c >> shift) & 0xF));
      }
    }
    System.err.println();
  }
}