1 package net.sf.saxon.type; 2 3 import net.sf.saxon.om.XMLChar; 4 import net.sf.saxon.om.FastStringBuffer; 5 6 import java.math.BigDecimal ; 7 import java.util.*; 8 9 15 public class RegexTranslator { 16 17 18 24 25 private final CharSequence regExp; 26 private boolean isXPath; 27 private int pos = 0; 28 private final int length; 29 private char curChar; 30 private boolean eos = false; 31 private final FastStringBuffer result = new FastStringBuffer(32); 32 33 private static final String categories = "LMNPZSC"; 34 private static final CharClass[] categoryCharClasses = new CharClass[categories.length()]; 35 private static final String subCategories = "LuLlLtLmLoMnMcMeNdNlNoPcPdPsPePiPfPoZsZlZpSmScSkSoCcCfCoCn"; 36 private static final CharClass[] subCategoryCharClasses = new CharClass[subCategories.length() / 2]; 37 38 private static final int NONBMP_MIN = 0x10000; 39 private static final int NONBMP_MAX = 0x10FFFF; 40 private static final char SURROGATE2_MIN = '\uDC00'; 41 private static final char SURROGATE2_MAX = '\uDFFF'; 42 43 45 private static final String [] blockNames = { 46 "BasicLatin", 47 "Latin-1Supplement", 48 "LatinExtended-A", 49 "LatinExtended-B", 50 "IPAExtensions", 51 "SpacingModifierLetters", 52 "CombiningDiacriticalMarks", 53 "Greek", 54 "Cyrillic", 55 "Armenian", 56 "Hebrew", 57 "Arabic", 58 "Syriac", 59 "Thaana", 60 "Devanagari", 61 "Bengali", 62 "Gurmukhi", 63 "Gujarati", 64 "Oriya", 65 "Tamil", 66 "Telugu", 67 "Kannada", 68 "Malayalam", 69 "Sinhala", 70 "Thai", 71 "Lao", 72 "Tibetan", 73 "Myanmar", 74 "Georgian", 75 "HangulJamo", 76 "Ethiopic", 77 "Cherokee", 78 "UnifiedCanadianAboriginalSyllabics", 79 "Ogham", 80 "Runic", 81 "Khmer", 82 "Mongolian", 83 "LatinExtendedAdditional", 84 "GreekExtended", 85 "GeneralPunctuation", 86 "SuperscriptsandSubscripts", 87 "CurrencySymbols", 88 "CombiningMarksforSymbols", 89 "LetterlikeSymbols", 90 "NumberForms", 91 "Arrows", 92 "MathematicalOperators", 93 "MiscellaneousTechnical", 94 "ControlPictures", 95 "OpticalCharacterRecognition", 96 "EnclosedAlphanumerics", 97 "BoxDrawing", 98 "BlockElements", 99 "GeometricShapes", 100 "MiscellaneousSymbols", 101 "Dingbats", 102 "BraillePatterns", 103 "CJKRadicalsSupplement", 104 "KangxiRadicals", 105 "IdeographicDescriptionCharacters", 106 "CJKSymbolsandPunctuation", 107 "Hiragana", 108 "Katakana", 109 "Bopomofo", 110 "HangulCompatibilityJamo", 111 "Kanbun", 112 "BopomofoExtended", 113 "EnclosedCJKLettersandMonths", 114 "CJKCompatibility", 115 "CJKUnifiedIdeographsExtensionA", 116 "CJKUnifiedIdeographs", 117 "YiSyllables", 118 "YiRadicals", 119 "HangulSyllables", 120 "CJKCompatibilityIdeographs", 123 "AlphabeticPresentationForms", 124 "ArabicPresentationForms-A", 125 "CombiningHalfMarks", 126 "CJKCompatibilityForms", 127 "SmallFormVariants", 128 "ArabicPresentationForms-B", 129 "Specials", 130 "HalfwidthandFullwidthForms", 131 "Specials" 132 }; 133 134 135 138 private static final String [] specialBlockNames = { 139 "OldItalic", 140 "Gothic", 141 "Deseret", 142 "ByzantineMusicalSymbols", 143 "MusicalSymbols", 144 "MathematicalAlphanumericSymbols", 145 "CJKUnifiedIdeographsExtensionB", 146 "CJKCompatibilityIdeographsSupplement", 147 "Tags", 148 "PrivateUse", 149 "HighSurrogates", 150 "HighPrivateUseSurrogates", 151 "LowSurrogates", 152 }; 153 154 156 static final String CATEGORY_NAMES = "NoLoMnCfLlNlPoLuMcNdSoSmCo"; 157 158 static final int[][] CATEGORY_RANGES = { 159 { 160 0x10107, 0x10133, 162 0x10320, 0x10323 163 }, 164 { 165 0x10000, 0x1000b, 167 0x1000d, 0x10026, 168 0x10028, 0x1003a, 169 0x1003c, 0x1003d, 170 0x1003f, 0x1004d, 171 0x10050, 0x1005d, 172 0x10080, 0x100fa, 173 0x10300, 0x1031e, 174 0x10330, 0x10349, 175 0x10380, 0x1039d, 176 0x10450, 0x1049d, 177 0x10800, 0x10805, 178 0x10808, 0x10808, 179 0x1080a, 0x10835, 180 0x10837, 0x10838, 181 0x1083c, 0x1083c, 182 0x1083f, 0x1083f, 183 0x20000, 0x2a6d6, 184 0x2f800, 0x2fa1d 185 }, 186 { 187 0x1d167, 0x1d169, 189 0x1d17b, 0x1d182, 190 0x1d185, 0x1d18b, 191 0x1d1aa, 0x1d1ad, 192 0xe0100, 0xe01ef 193 }, 194 { 195 0x1d173, 0x1d17a, 197 0xe0001, 0xe0001, 198 0xe0020, 0xe007f 199 }, 200 { 201 0x10428, 0x1044f, 203 0x1d41a, 0x1d433, 204 0x1d44e, 0x1d454, 205 0x1d456, 0x1d467, 206 0x1d482, 0x1d49b, 207 0x1d4b6, 0x1d4b9, 208 0x1d4bb, 0x1d4bb, 209 0x1d4bd, 0x1d4c3, 210 0x1d4c5, 0x1d4cf, 211 0x1d4ea, 0x1d503, 212 0x1d51e, 0x1d537, 213 0x1d552, 0x1d56b, 214 0x1d586, 0x1d59f, 215 0x1d5ba, 0x1d5d3, 216 0x1d5ee, 0x1d607, 217 0x1d622, 0x1d63b, 218 0x1d656, 0x1d66f, 219 0x1d68a, 0x1d6a3, 220 0x1d6c2, 0x1d6da, 221 0x1d6dc, 0x1d6e1, 222 0x1d6fc, 0x1d714, 223 0x1d716, 0x1d71b, 224 0x1d736, 0x1d74e, 225 0x1d750, 0x1d755, 226 0x1d770, 0x1d788, 227 0x1d78a, 0x1d78f, 228 0x1d7aa, 0x1d7c2, 229 0x1d7c4, 0x1d7c9 230 }, 231 { 232 0x1034a, 0x1034a 234 }, 235 { 236 0x10100, 0x10101, 238 0x1039f, 0x1039f 239 }, 240 { 241 0x10400, 0x10427, 243 0x1d400, 0x1d419, 244 0x1d434, 0x1d44d, 245 0x1d468, 0x1d481, 246 0x1d49c, 0x1d49c, 247 0x1d49e, 0x1d49f, 248 0x1d4a2, 0x1d4a2, 249 0x1d4a5, 0x1d4a6, 250 0x1d4a9, 0x1d4ac, 251 0x1d4ae, 0x1d4b5, 252 0x1d4d0, 0x1d4e9, 253 0x1d504, 0x1d505, 254 0x1d507, 0x1d50a, 255 0x1d50d, 0x1d514, 256 0x1d516, 0x1d51c, 257 0x1d538, 0x1d539, 258 0x1d53b, 0x1d53e, 259 0x1d540, 0x1d544, 260 0x1d546, 0x1d546, 261 0x1d54a, 0x1d550, 262 0x1d56c, 0x1d585, 263 0x1d5a0, 0x1d5b9, 264 0x1d5d4, 0x1d5ed, 265 0x1d608, 0x1d621, 266 0x1d63c, 0x1d655, 267 0x1d670, 0x1d689, 268 0x1d6a8, 0x1d6c0, 269 0x1d6e2, 0x1d6fa, 270 0x1d71c, 0x1d734, 271 0x1d756, 0x1d76e, 272 0x1d790, 0x1d7a8 273 }, 274 { 275 0x1d165, 0x1d166, 277 0x1d16d, 0x1d172 278 }, 279 { 280 0x104a0, 0x104a9, 282 0x1d7ce, 0x1d7ff 283 }, 284 { 285 0x10102, 0x10102, 287 0x10137, 0x1013f, 288 0x1d000, 0x1d0f5, 289 0x1d100, 0x1d126, 290 0x1d12a, 0x1d164, 291 0x1d16a, 0x1d16c, 292 0x1d183, 0x1d184, 293 0x1d18c, 0x1d1a9, 294 0x1d1ae, 0x1d1dd, 295 0x1d300, 0x1d356 296 }, 297 { 298 0x1d6c1, 0x1d6c1, 300 0x1d6db, 0x1d6db, 301 0x1d6fb, 0x1d6fb, 302 0x1d715, 0x1d715, 303 0x1d735, 0x1d735, 304 0x1d74f, 0x1d74f, 305 0x1d76f, 0x1d76f, 306 0x1d789, 0x1d789, 307 0x1d7a9, 0x1d7a9, 308 0x1d7c3, 0x1d7c3 309 }, 310 { 311 0xf0000, 0xffffd, 313 0x100000, 0x10fffd 314 } 315 }; 316 317 319 322 private static final CharClass[] specialBlockCharClasses = { 323 new CharRange(0x10300, 0x1032F), 324 new CharRange(0x10330, 0x1034F), 325 new CharRange(0x10400, 0x1044F), 326 new CharRange(0x1D000, 0x1D0FF), 327 new CharRange(0x1D100, 0x1D1FF), 328 new CharRange(0x1D400, 0x1D7FF), 329 new CharRange(0x20000, 0x2A6D6), 330 new CharRange(0x2F800, 0x2FA1F), 331 new CharRange(0xE0000, 0xE007F), 332 new Union(new CharClass[]{ 333 new CharRange(0xE000, 0xF8FF), 334 new CharRange(0xF0000, 0xFFFFD), 335 new CharRange(0x100000, 0x10FFFD) 336 }), 337 Empty.getInstance(), 338 Empty.getInstance(), 339 Empty.getInstance() 340 }; 341 342 private static final CharClass DOT = new Complement(new Union(new CharClass[]{new SingleChar('\n'), new SingleChar('\r')})); 343 344 private static final CharClass ESC_d = new Property("Nd"); 345 346 private static final CharClass ESC_D = new Complement(ESC_d); 347 348 private static final CharClass ESC_W = new Union(new CharClass[]{new Property("P"), new Property("Z"), new Property("C")}); 349 350 private static final CharClass ESC_w = new Complement(ESC_W); 351 352 private static final CharClass ESC_s = new Union(new CharClass[]{ 353 new SingleChar(' '), 354 new SingleChar('\n'), 355 new SingleChar('\r'), 356 new SingleChar('\t') 357 }); 358 359 static final String NMSTRT_INCLUDES = 362 "\u003A\u005F\u02BB\u02BC\u02BD\u02BE\u02BF\u02C0\u02C1\u0559" + 363 "\u06E5\u06E6\u212E"; 364 static final String NMSTRT_EXCLUDE_RANGES = 365 "\u00AA\u00BA\u0132\u0133\u013F\u0140\u0149\u0149\u017F\u017F" + 366 "\u01C4\u01CC\u01F1\u01F3\u01F6\u01F9\u0218\u0233\u02A9\u02AD" + 367 "\u03D7\u03D7\u03DB\u03DB\u03DD\u03DD\u03DF\u03DF\u03E1\u03E1" + 368 "\u0400\u0400\u040D\u040D\u0450\u0450\u045D\u045D\u048C\u048F" + 369 "\u04EC\u04ED\u0587\u0587\u06B8\u06B9\u06BF\u06BF\u06CF\u06CF" + 370 "\u06FA\u07A5\u0950\u0950\u0AD0\u0AD0\u0D85\u0DC6\u0E2F\u0E2F" + 371 "\u0EAF\u0EAF\u0EDC\u0F00\u0F6A\u1055\u1101\u1101\u1104\u1104" + 372 "\u1108\u1108\u110A\u110A\u110D\u110D\u1113\u113B\u113D\u113D" + 373 "\u113F\u113F\u1141\u114B\u114D\u114D\u114F\u114F\u1151\u1153" + 374 "\u1156\u1158\u1162\u1162\u1164\u1164\u1166\u1166\u1168\u1168" + 375 "\u116A\u116C\u116F\u1171\u1174\u1174\u1176\u119D\u119F\u11A2" + 376 "\u11A9\u11AA\u11AC\u11AD\u11B0\u11B6\u11B9\u11B9\u11BB\u11BB" + 377 "\u11C3\u11EA\u11EC\u11EF\u11F1\u11F8\u1200\u18A8\u207F\u2124" + 378 "\u2128\u2128\u212C\u212D\u212F\u217F\u2183\u3006\u3038\u303A" + 379 "\u3131\u4DB5\uA000\uA48C\uF900\uFFDC"; 380 static final String NMSTRT_CATEGORIES = "LlLuLoLtNl"; 381 static final String NMCHAR_INCLUDES = 382 "\u002D\u002E\u003A\u005F\u00B7\u0387\u212E"; 383 static final String NMCHAR_EXCLUDE_RANGES = 384 "\u00AA\u00B5\u00BA\u00BA\u0132\u0133\u013F\u0140\u0149\u0149" + 385 "\u017F\u017F\u01C4\u01CC\u01F1\u01F3\u01F6\u01F9\u0218\u0233" + 386 "\u02A9\u02B8\u02E0\u02EE\u0346\u034E\u0362\u037A\u03D7\u03D7" + 387 "\u03DB\u03DB\u03DD\u03DD\u03DF\u03DF\u03E1\u03E1\u0400\u0400" + 388 "\u040D\u040D\u0450\u0450\u045D\u045D\u0488\u048F\u04EC\u04ED" + 389 "\u0587\u0587\u0653\u0655\u06B8\u06B9\u06BF\u06BF\u06CF\u06CF" + 390 "\u06FA\u07B0\u0950\u0950\u0AD0\u0AD0\u0D82\u0DF3\u0E2F\u0E2F" + 391 "\u0EAF\u0EAF\u0EDC\u0F00\u0F6A\u0F6A\u0F96\u0F96\u0FAE\u0FB0" + 392 "\u0FB8\u0FB8\u0FBA\u1059\u1101\u1101\u1104\u1104\u1108\u1108" + 393 "\u110A\u110A\u110D\u110D\u1113\u113B\u113D\u113D\u113F\u113F" + 394 "\u1141\u114B\u114D\u114D\u114F\u114F\u1151\u1153\u1156\u1158" + 395 "\u1162\u1162\u1164\u1164\u1166\u1166\u1168\u1168\u116A\u116C" + 396 "\u116F\u1171\u1174\u1174\u1176\u119D\u119F\u11A2\u11A9\u11AA" + 397 "\u11AC\u11AD\u11B0\u11B6\u11B9\u11B9\u11BB\u11BB\u11C3\u11EA" + 398 "\u11EC\u11EF\u11F1\u11F8\u1200\u18A9\u207F\u207F\u20DD\u20E0" + 399 "\u20E2\u2124\u2128\u2128\u212C\u212D\u212F\u217F\u2183\u2183" + 400 "\u3006\u3006\u3038\u303A\u3131\u4DB5\uA000\uA48C\uF900\uFFDC"; 401 static final String NMCHAR_CATEGORIES = "LlLuLoLtNlMcMeMnLmNd"; 402 404 private static final CharClass ESC_S = new Complement(ESC_s); 405 406 private static final CharClass ESC_i = makeCharClass(NMSTRT_CATEGORIES, 407 NMSTRT_INCLUDES, 408 NMSTRT_EXCLUDE_RANGES); 409 410 private static final CharClass ESC_I = new Complement(ESC_i); 411 412 private static final CharClass ESC_c = makeCharClass(NMCHAR_CATEGORIES, 413 NMCHAR_INCLUDES, 414 NMCHAR_EXCLUDE_RANGES); 415 416 private static final CharClass ESC_C = new Complement(ESC_c); 417 418 private static final char EOS = '\0'; 419 420 private RegexTranslator(CharSequence regExp) { 421 this.regExp = regExp; 422 this.length = regExp.length(); 423 advance(); 424 } 425 426 443 public static String translate(CharSequence regexp, boolean xpath) throws RegexSyntaxException { 444 RegexTranslator tr = new RegexTranslator(regexp); 445 tr.isXPath = xpath; 446 tr.translateTop(); 447 return tr.result.toString(); 448 } 449 450 private void advance() { 451 if (pos < length) 452 curChar = regExp.charAt(pos++); 453 else { 454 pos++; 455 curChar = EOS; 456 eos = true; 457 } 458 } 459 460 private void translateTop() throws RegexSyntaxException { 461 translateRegExp(); 462 if (!eos) 463 throw makeException("expected end of string"); 464 } 465 466 private void translateRegExp() throws RegexSyntaxException { 467 translateBranch(); 468 while (curChar == '|') { 469 copyCurChar(); 470 translateBranch(); 471 } 472 } 473 474 private void translateBranch() throws RegexSyntaxException { 475 while (translateAtom()) 476 translateQuantifier(); 477 } 478 479 private void translateQuantifier() throws RegexSyntaxException { 480 switch (curChar) { 481 case '*': 482 case '?': 483 case '+': 484 copyCurChar(); 485 break; 486 case '{': 487 copyCurChar(); 488 translateQuantity(); 489 expect('}'); 490 copyCurChar(); 491 break; 492 default: 493 return; 494 } 495 if (curChar=='?' && isXPath) { 496 copyCurChar(); 497 } 498 } 499 500 private void translateQuantity() throws RegexSyntaxException { 501 String lower = parseQuantExact().toString(); 502 int lowerValue = -1; 503 try { 504 lowerValue = Integer.parseInt(lower); 505 result.append(lower); 506 } catch (NumberFormatException e) { 507 result.append(""+Integer.MAX_VALUE); 509 } 510 if (curChar == ',') { 511 copyCurChar(); 512 if (curChar != '}') { 513 String upper = parseQuantExact().toString(); 514 try { 515 int upperValue = Integer.parseInt(upper); 516 result.append(upper); 517 if (lowerValue < 0 || upperValue < lowerValue) 518 throw makeException("invalid range in quantifier"); 519 } catch (NumberFormatException e) { 520 result.append(""+Integer.MAX_VALUE); 521 if (lowerValue < 0 && new BigDecimal (lower).compareTo(new BigDecimal (upper)) > 0) 522 throw makeException("invalid range in quantifier"); 523 } 524 } 525 } 526 } 527 528 private CharSequence parseQuantExact() throws RegexSyntaxException { 529 FastStringBuffer buf = new FastStringBuffer(10); 530 do { 531 if ("0123456789".indexOf(curChar) < 0) 532 throw makeException("expected digit in quantifier"); 533 buf.append(curChar); 534 advance(); 535 } while (curChar != ',' && curChar != '}'); 536 return buf; 537 } 538 539 private void copyCurChar() { 540 result.append(curChar); 541 advance(); 542 } 543 544 static final int NONE = -1; 545 static final int SOME = 0; 546 static final int ALL = 1; 547 548 static final String SURROGATES1_CLASS = "[\uD800-\uDBFF]"; 549 static final String SURROGATES2_CLASS = "[\uDC00-\uDFFF]"; 550 static final String NOT_ALLOWED_CLASS = "[\u0000&&[^\u0000]]"; 551 552 static final class Range implements Comparable { 553 private final int min; 554 private final int max; 555 556 Range(int min, int max) { 557 this.min = min; 558 this.max = max; 559 } 560 561 int getMin() { 562 return min; 563 } 564 565 int getMax() { 566 return max; 567 } 568 569 public int compareTo(Object o) { 570 Range other = (Range)o; 571 if (this.min < other.min) 572 return -1; 573 if (this.min > other.min) 574 return 1; 575 if (this.max > other.max) 576 return -1; 577 if (this.max < other.max) 578 return 1; 579 return 0; 580 } 581 } 582 583 static abstract class CharClass { 584 585 private final int containsBmp; 586 private final int containsNonBmp; 589 590 protected CharClass(int containsBmp, int containsNonBmp) { 591 this.containsBmp = containsBmp; 592 this.containsNonBmp = containsNonBmp; 593 } 594 595 int getContainsBmp() { 596 return containsBmp; 597 } 598 599 int getContainsNonBmp() { 600 return containsNonBmp; 601 } 602 603 final void output(FastStringBuffer buf) { 604 switch (containsNonBmp) { 605 case NONE: 606 if (containsBmp == NONE) 607 buf.append(NOT_ALLOWED_CLASS); 608 else 609 outputBmp(buf); 610 break; 611 case ALL: 612 buf.append("(?:"); 613 if (containsBmp == NONE) { 614 buf.append(SURROGATES1_CLASS); 615 buf.append(SURROGATES2_CLASS); 616 } else { 617 outputBmp(buf); 618 buf.append(SURROGATES2_CLASS); 619 buf.append('?'); 620 } 621 buf.append(')'); 622 break; 623 case SOME: 624 buf.append("(?:"); 625 boolean needSep = false; 626 if (containsBmp != NONE) { 627 needSep = true; 628 outputBmp(buf); 629 } 630 List ranges = new ArrayList(10); 631 addNonBmpRanges(ranges); 632 sortRangeList(ranges); 633 String hi = highSurrogateRanges(ranges); 634 if (hi.length() > 0) { 635 if (needSep) 636 buf.append('|'); 637 else 638 needSep = true; 639 buf.append('['); 640 for (int i = 0, len = hi.length(); i < len; i += 2) { 641 char min = hi.charAt(i); 642 char max = hi.charAt(i + 1); 643 if (min == max) 644 buf.append(min); 645 else { 646 buf.append(min); 647 buf.append('-'); 648 buf.append(max); 649 } 650 } 651 buf.append(']'); 652 buf.append(SURROGATES2_CLASS); 653 } 654 String lo = lowSurrogateRanges(ranges); 655 for (int i = 0, len = lo.length(); i < len; i += 3) { 656 if (needSep) 657 buf.append('|'); 658 else 659 needSep = true; 660 buf.append(lo.charAt(i)); 661 char min = lo.charAt(i + 1); 662 char max = lo.charAt(i + 2); 663 if (min == max && (i + 3 >= len || lo.charAt(i + 3) != lo.charAt(i))) 664 buf.append(min); 665 else { 666 buf.append('['); 667 for (; ;) { 668 if (min == max) 669 buf.append(min); 670 else { 671 buf.append(min); 672 buf.append('-'); 673 buf.append(max); 674 } 675 if (i + 3 >= len || lo.charAt(i + 3) != lo.charAt(i)) 676 break; 677 i += 3; 678 min = lo.charAt(i + 1); 679 max = lo.charAt(i + 2); 680 } 681 buf.append(']'); 682 } 683 } 684 if (!needSep) 685 buf.append(NOT_ALLOWED_CLASS); 686 buf.append(')'); 687 break; 688 } 689 } 690 691 static String highSurrogateRanges(List ranges) { 692 FastStringBuffer highRanges = new FastStringBuffer(ranges.size() * 2); 693 for (int i = 0, len = ranges.size(); i < len; i++) { 694 Range r = (Range)ranges.get(i); 695 char min1 = XMLChar.highSurrogate(r.getMin()); 696 char min2 = XMLChar.lowSurrogate(r.getMin()); 697 char max1 = XMLChar.highSurrogate(r.getMax()); 698 char max2 = XMLChar.lowSurrogate(r.getMax()); 699 if (min2 != SURROGATE2_MIN) 700 min1++; 701 if (max2 != SURROGATE2_MAX) 702 max1--; 703 if (max1 >= min1) { 704 highRanges.append(min1); 705 highRanges.append(max1); 706 } 707 } 708 return highRanges.toString(); 709 } 710 711 static String lowSurrogateRanges(List ranges) { 712 FastStringBuffer lowRanges = new FastStringBuffer(ranges.size() * 2); 713 for (int i = 0, len = ranges.size(); i < len; i++) { 714 Range r = (Range)ranges.get(i); 715 char min1 = XMLChar.highSurrogate(r.getMin()); 716 char min2 = XMLChar.lowSurrogate(r.getMin()); 717 char max1 = XMLChar.highSurrogate(r.getMax()); 718 char max2 = XMLChar.lowSurrogate(r.getMax()); 719 if (min1 == max1) { 720 if (min2 != SURROGATE2_MIN || max2 != SURROGATE2_MAX) { 721 lowRanges.append(min1); 722 lowRanges.append(min2); 723 lowRanges.append(max2); 724 } 725 } else { 726 if (min2 != SURROGATE2_MIN) { 727 lowRanges.append(min1); 728 lowRanges.append(min2); 729 lowRanges.append(SURROGATE2_MAX); 730 } 731 if (max2 != SURROGATE2_MAX) { 732 lowRanges.append(max1); 733 lowRanges.append(SURROGATE2_MIN); 734 lowRanges.append(max2); 735 } 736 } 737 } 738 return lowRanges.toString(); 739 } 740 741 abstract void outputBmp(FastStringBuffer buf); 742 743 abstract void outputComplementBmp(FastStringBuffer buf); 744 745 int getSingleChar() { 746 return -1; 747 } 748 749 void addNonBmpRanges(List ranges) { 750 } 751 752 753 static void sortRangeList(List ranges) { 754 Collections.sort(ranges); 755 int toIndex = 0; 756 int fromIndex = 0; 757 int len = ranges.size(); 758 while (fromIndex < len) { 759 Range r = (Range)ranges.get(fromIndex); 760 int min = r.getMin(); 761 int max = r.getMax(); 762 while (++fromIndex < len) { 763 Range r2 = (Range)ranges.get(fromIndex); 764 if (r2.getMin() > max + 1) 765 break; 766 if (r2.getMax() > max) 767 max = r2.getMax(); 768 } 769 if (max != r.getMax()) 770 r = new Range(min, max); 771 ranges.set(toIndex++, r); 772 } 773 while (len > toIndex) 774 ranges.remove(--len); 775 } 776 777 } 778 779 static abstract class SimpleCharClass extends CharClass { 780 SimpleCharClass(int containsBmp, int containsNonBmp) { 781 super(containsBmp, containsNonBmp); 782 } 783 784 void outputBmp(FastStringBuffer buf) { 785 buf.append('['); 786 inClassOutputBmp(buf); 787 buf.append(']'); 788 } 789 790 void outputComplementBmp(FastStringBuffer buf) { 792 if (getContainsBmp() == NONE) 793 buf.append("[\u0000-\uFFFF]"); 794 else { 795 buf.append("[^"); 796 inClassOutputBmp(buf); 797 buf.append(']'); 798 } 799 } 800 801 abstract void inClassOutputBmp(FastStringBuffer buf); 802 } 803 804 static class SingleChar extends SimpleCharClass { 805 private final char c; 806 807 SingleChar(char c) { 808 super(SOME, NONE); 809 this.c = c; 810 } 811 812 int getSingleChar() { 813 return c; 814 } 815 816 void outputBmp(FastStringBuffer buf) { 817 inClassOutputBmp(buf); 818 } 819 820 void inClassOutputBmp(FastStringBuffer buf) { 821 if (isJavaMetaChar(c)) { 822 buf.append('\\'); 823 buf.append(c); 824 } else { 825 switch (c) { 826 case '\r': 827 buf.append("\\r"); 828 break; 829 case '\n': 830 buf.append("\\n"); 831 break; 832 case '\t': 833 buf.append("\\t"); 834 break; 835 case ' ': 836 buf.append("\\x20"); 837 break; 838 default: 839 buf.append(c); 840 } 841 } 842 return; 843 } 844 845 } 846 847 static class WideSingleChar extends SimpleCharClass { 848 private final int c; 849 850 WideSingleChar(int c) { 851 super(NONE, SOME); 852 this.c = c; 853 } 854 855 void inClassOutputBmp(FastStringBuffer buf) { 856 throw new RuntimeException ("BMP output botch"); 857 } 858 859 int getSingleChar() { 860 return c; 861 } 862 863 void addNonBmpRanges(List ranges) { 864 ranges.add(new Range(c, c)); 865 } 866 } 867 868 static class Empty extends SimpleCharClass { 869 private static final Empty instance = new Empty(); 870 871 private Empty() { 872 super(NONE, NONE); 873 } 874 875 static Empty getInstance() { 876 return instance; 877 } 878 879 void inClassOutputBmp(FastStringBuffer buf) { 880 throw new RuntimeException ("BMP output botch"); 881 } 882 883 } 884 885 static class CharRange extends SimpleCharClass { 886 private final int lower; 887 private final int upper; 888 889 CharRange(int lower, int upper) { 890 super(lower < NONBMP_MIN ? SOME : NONE, 891 upper >= NONBMP_MIN ? SOME : NONE); 893 this.lower = lower; 894 this.upper = upper; 895 } 896 897 void inClassOutputBmp(FastStringBuffer buf) { 898 if (lower >= NONBMP_MIN) 899 throw new RuntimeException ("BMP output botch"); 900 if (isJavaMetaChar((char)lower)) 901 buf.append('\\'); 902 buf.append((char)lower); 903 buf.append('-'); 904 if (upper < NONBMP_MIN) { 905 if (isJavaMetaChar((char)upper)) 906 buf.append('\\'); 907 buf.append((char)upper); 908 } else 909 buf.append('\uFFFF'); 910 } 911 912 void addNonBmpRanges(List ranges) { 913 if (upper >= NONBMP_MIN) 914 ranges.add(new Range(lower < NONBMP_MIN ? NONBMP_MIN : lower, upper)); 915 } 916 } 917 918 static class Property extends SimpleCharClass { 919 private final String name; 920 921 Property(String name) { 922 super(SOME, NONE); 923 this.name = name; 924 } 925 926 void outputBmp(FastStringBuffer buf) { 927 inClassOutputBmp(buf); 928 } 929 930 void inClassOutputBmp(FastStringBuffer buf) { 931 buf.append("\\p{"); 932 buf.append(name); 933 buf.append('}'); 934 } 935 936 void outputComplementBmp(FastStringBuffer buf) { 937 buf.append("\\P{"); 938 buf.append(name); 939 buf.append('}'); 940 } 941 } 942 943 static class Subtraction extends CharClass { 944 private final CharClass cc1; 945 private final CharClass cc2; 946 947 Subtraction(CharClass cc1, CharClass cc2) { 948 super(Math.min(cc1.getContainsBmp(), -cc2.getContainsBmp()), 951 Math.min(cc1.getContainsNonBmp(), -cc2.getContainsNonBmp())); 952 this.cc1 = cc1; 953 this.cc2 = cc2; 954 } 955 956 void outputBmp(FastStringBuffer buf) { 957 buf.append('['); 958 cc1.outputBmp(buf); 959 buf.append("&&"); 960 cc2.outputComplementBmp(buf); 961 buf.append(']'); 962 } 963 964 void outputComplementBmp(FastStringBuffer buf) { 965 buf.append('['); 966 cc1.outputComplementBmp(buf); 967 cc2.outputBmp(buf); 968 buf.append(']'); 969 } 970 971 void addNonBmpRanges(List ranges) { 972 List posList = new Vector(); 973 cc1.addNonBmpRanges(posList); 974 List negList = new Vector(); 975 cc2.addNonBmpRanges(negList); 976 sortRangeList(posList); 977 sortRangeList(negList); 978 Iterator negIter = negList.iterator(); 979 Range negRange; 980 if (negIter.hasNext()) 981 negRange = (Range)negIter.next(); 982 else 983 negRange = null; 984 for (int i = 0, len = posList.size(); i < len; i++) { 985 Range posRange = (Range)posList.get(i); 986 while (negRange != null && negRange.getMax() < posRange.getMin()) { 987 if (negIter.hasNext()) 988 negRange = (Range)negIter.next(); 989 else 990 negRange = null; 991 } 992 int min = posRange.getMin(); 994 while (negRange != null && negRange.getMin() <= posRange.getMax()) { 995 if (min < negRange.getMin()) { 996 ranges.add(new Range(min, negRange.getMin() - 1)); 997 } 998 min = negRange.getMax() + 1; 999 if (min > posRange.getMax()) 1000 break; 1001 if (negIter.hasNext()) 1002 negRange = (Range)negIter.next(); 1003 else 1004 negRange = null; 1005 } 1006 if (min <= posRange.getMax()) 1007 ranges.add(new Range(min, posRange.getMax())); 1008 } 1009 } 1010 } 1011 1012 static class Union extends CharClass { 1013 private final List members; 1014 1015 Union(CharClass[] v) { 1016 this(toList(v)); 1017 } 1018 1019 private static List toList(CharClass[] v) { 1020 List members = new Vector(); 1021 for (int i = 0; i < v.length; i++) 1022 members.add(v[i]); 1023 return members; 1024 } 1025 1026 Union(List members) { 1027 super(computeContainsBmp(members), computeContainsNonBmp(members)); 1028 this.members = members; 1029 } 1030 1031 void outputBmp(FastStringBuffer buf) { 1032 buf.append('['); 1033 for (int i = 0, len = members.size(); i < len; i++) { 1034 CharClass cc = (CharClass)members.get(i); 1035 if (cc.getContainsBmp() != NONE) { 1036 if (cc instanceof SimpleCharClass) 1037 ((SimpleCharClass)cc).inClassOutputBmp(buf); 1038 else 1039 cc.outputBmp(buf); 1040 } 1041 } 1042 buf.append(']'); 1043 } 1044 1045 void outputComplementBmp(FastStringBuffer buf) { 1046 boolean first = true; 1047 int len = members.size(); 1048 for (int i = 0; i < len; i++) { 1049 CharClass cc = (CharClass)members.get(i); 1050 if (cc.getContainsBmp() != NONE && cc instanceof SimpleCharClass) { 1051 if (first) { 1052 buf.append("[^"); 1053 first = false; 1054 } 1055 ((SimpleCharClass)cc).inClassOutputBmp(buf); 1056 } 1057 } 1058 for (int i = 0; i < len; i++) { 1059 CharClass cc = (CharClass)members.get(i); 1060 if (cc.getContainsBmp() != NONE && !(cc instanceof SimpleCharClass)) { 1061 if (first) { 1062 buf.append('['); 1063 first = false; 1064 } else 1065 buf.append("&&"); 1066 cc.outputComplementBmp(buf); 1069 } 1070 } 1071 if (first == true) 1072 buf.append("[\u0000-\uFFFF]"); 1074 else 1075 buf.append(']'); 1076 } 1077 1078 void addNonBmpRanges(List ranges) { 1079 for (int i = 0, len = members.size(); i < len; i++) 1080 ((CharClass)members.get(i)).addNonBmpRanges(ranges); 1081 } 1082 1083 private static int computeContainsBmp(List members) { 1084 int ret = NONE; 1085 for (int i = 0, len = members.size(); i < len; i++) 1086 ret = Math.max(ret, ((CharClass)members.get(i)).getContainsBmp()); 1087 return ret; 1088 } 1089 1090 private static int computeContainsNonBmp(List members) { 1091 int ret = NONE; 1092 for (int i = 0, len = members.size(); i < len; i++) 1093 ret = Math.max(ret, ((CharClass)members.get(i)).getContainsNonBmp()); 1094 return ret; 1095 } 1096 } 1097 1098 static class BackReference extends CharClass { 1099 private final int i; 1100 1101 BackReference(int i) { 1102 super(SOME, NONE); 1103 this.i = i; 1104 } 1105 1106 void outputBmp(FastStringBuffer buf) { 1107 inClassOutputBmp(buf); 1108 } 1109 1110 void outputComplementBmp(FastStringBuffer buf) { 1111 inClassOutputBmp(buf); 1112 } 1113 1114 void inClassOutputBmp(FastStringBuffer buf) { 1115 buf.append("\\" + i); 1116 } 1117 } 1118 1119 1122 public static class RegexSyntaxException extends Exception { 1123 private final int position; 1124 1125 1128 public static final int UNKNOWN_POSITION = -1; 1129 1130 public RegexSyntaxException(String detail) { 1131 this(detail, UNKNOWN_POSITION); 1132 } 1133 1134 public RegexSyntaxException(String detail, int position) { 1135 super(detail); 1136 this.position = position; 1137 } 1138 1139 1146 public int getPosition() { 1147 return position; 1148 } 1149 } 1150 1151 1191 static class Complement extends CharClass { 1192 private final CharClass cc; 1193 1194 Complement(CharClass cc) { 1195 super(-cc.getContainsBmp(), -cc.getContainsNonBmp()); 1196 this.cc = cc; 1197 } 1198 1199 void outputBmp(FastStringBuffer buf) { 1200 cc.outputComplementBmp(buf); 1201 } 1202 1203 void outputComplementBmp(FastStringBuffer buf) { 1204 cc.outputBmp(buf); 1205 } 1206 1207 void addNonBmpRanges(List ranges) { 1208 List tem = new Vector(); 1209 cc.addNonBmpRanges(tem); 1210 sortRangeList(tem); 1211 int c = NONBMP_MIN; 1212 for (int i = 0, len = tem.size(); i < len; i++) { 1213 Range r = (Range)tem.get(i); 1214 if (r.getMin() > c) 1215 ranges.add(new Range(c, r.getMin() - 1)); 1216 c = r.getMax() + 1; 1217 } 1218 if (c != NONBMP_MAX + 1) 1219 ranges.add(new Range(c, NONBMP_MAX)); 1220 } 1221 } 1222 1223 private boolean translateAtom() throws RegexSyntaxException { 1224 switch (curChar) { 1225 case EOS: 1226 if (!eos) 1227 break; 1228 case '?': 1230 case '*': 1231 case '+': 1232 case ')': 1233 case '{': 1234 case '}': 1235 case '|': 1236 case ']': 1237 return false; 1238 case '(': 1239 copyCurChar(); 1240 translateRegExp(); 1241 expect(')'); 1242 copyCurChar(); 1243 return true; 1244 case '\\': 1245 advance(); 1246 parseEsc().output(result); 1247 return true; 1248 case '[': 1249 advance(); 1250 parseCharClassExpr().output(result); 1251 return true; 1252 case '.': 1253 if (isXPath) { 1254 break; 1257 } else { 1258 DOT.output(result); 1259 advance(); 1260 return true; 1261 } 1262 case '$': 1263 case '^': 1264 if (isXPath) { 1265 copyCurChar(); 1266 return true; 1267 } 1268 result.append('\\'); 1269 break; 1270 } 1271 copyCurChar(); 1272 return true; 1273 } 1274 1275 1276 static private CharClass makeCharClass(String categories, String includes, String excludeRanges) { 1277 List includeList = new Vector(); 1278 for (int i = 0, len = categories.length(); i < len; i += 2) 1279 includeList.add(new Property(categories.substring(i, i + 2))); 1280 for (int i = 0, len = includes.length(); i < len; i++) { 1281 int j = i + 1; 1282 for (; j < len && includes.charAt(j) - includes.charAt(i) == j - i; j++) 1283 ; 1284 --j; 1285 if (i == j - 1) 1286 --j; 1287 if (i == j) 1288 includeList.add(new SingleChar(includes.charAt(i))); 1289 else 1290 includeList.add(new CharRange(includes.charAt(i), includes.charAt(j))); 1291 i = j; 1292 } 1293 List excludeList = new Vector(); 1294 for (int i = 0, len = excludeRanges.length(); i < len; i += 2) { 1295 char min = excludeRanges.charAt(i); 1296 char max = excludeRanges.charAt(i + 1); 1297 if (min == max) 1298 excludeList.add(new SingleChar(min)); 1299 else if (min == max - 1) { 1300 excludeList.add(new SingleChar(min)); 1301 excludeList.add(new SingleChar(max)); 1302 } else 1303 excludeList.add(new CharRange(min, max)); 1304 } 1305 return new Subtraction(new Union(includeList), new Union(excludeList)); 1306 } 1307 1308 private CharClass parseEsc() throws RegexSyntaxException { 1309 switch (curChar) { 1310 case 'n': 1311 advance(); 1312 return new SingleChar('\n'); 1313 case 'r': 1314 advance(); 1315 return new SingleChar('\r'); 1316 case 't': 1317 advance(); 1318 return new SingleChar('\t'); 1319 case '\\': 1320 case '|': 1321 case '.': 1322 case '-': 1323 case '^': 1324 case '?': 1325 case '*': 1326 case '+': 1327 case '(': 1328 case ')': 1329 case '{': 1330 case '}': 1331 case '[': 1332 case ']': 1333 break; 1334 case 's': 1335 advance(); 1336 return ESC_s; 1337 case 'S': 1338 advance(); 1339 return ESC_S; 1340 case 'i': 1341 advance(); 1342 return ESC_i; 1343 case 'I': 1344 advance(); 1345 return ESC_I; 1346 case 'c': 1347 advance(); 1348 return ESC_c; 1349 case 'C': 1350 advance(); 1351 return ESC_C; 1352 case 'd': 1353 advance(); 1354 return ESC_d; 1355 case 'D': 1356 advance(); 1357 return ESC_D; 1358 case 'w': 1359 advance(); 1360 return ESC_w; 1361 case 'W': 1362 advance(); 1363 return ESC_W; 1364 case 'p': 1365 advance(); 1366 return parseProp(); 1367 case 'P': 1368 advance(); 1369 return new Complement(parseProp()); 1370 case '0': case '1': case '2': case '3': case '4': 1371 case '5': case '6': case '7': case '8': case '9': 1372 if (isXPath) { 1373 char c = curChar; 1374 advance(); 1375 return new BackReference(c - '0'); 1376 } else { 1377 throw makeException("digit not allowed after \\"); 1378 } 1379 case '$': 1380 if (isXPath) { 1381 break; 1382 } 1383 default: 1385 throw makeException("invalid escape sequence"); 1386 } 1387 CharClass tem = new SingleChar(curChar); 1388 advance(); 1389 return tem; 1390 } 1391 1392 private CharClass parseProp() throws RegexSyntaxException { 1393 expect('{'); 1394 int start = pos; 1395 for (; ;) { 1396 advance(); 1397 if (curChar == '}') 1398 break; 1399 if (!isAsciiAlnum(curChar) && curChar != '-') 1400 expect('}'); 1401 } 1402 String propertyName = regExp.subSequence(start, pos - 1).toString(); 1403 advance(); 1404 switch (propertyName.length()) { 1405 case 0: 1406 throw makeException("empty property name"); 1407 case 2: 1408 int sci = subCategories.indexOf(propertyName); 1409 if (sci < 0 || sci % 2 == 1) 1410 throw makeException("unknown category"); 1411 return getSubCategoryCharClass(sci / 2); 1412 case 1: 1413 int ci = categories.indexOf(propertyName.charAt(0)); 1414 if (ci < 0) 1415 throw makeException("unknown category", propertyName); 1416 return getCategoryCharClass(ci); 1417 default: 1418 if (!propertyName.startsWith("Is")) 1419 break; 1420 String blockName = propertyName.substring(2); 1421 for (int i = 0; i < specialBlockNames.length; i++) 1422 if (blockName.equals(specialBlockNames[i])) 1423 return specialBlockCharClasses[i]; 1424 if (!isBlock(blockName)) 1425 throw makeException("invalid block name", blockName); 1426 return new Property("In" + blockName); 1427 } 1428 throw makeException("invalid property name", propertyName); 1429 } 1430 1431 private static boolean isBlock(String name) { 1432 for (int i = 0; i < blockNames.length; i++) 1433 if (name.equals(blockNames[i])) 1434 return true; 1435 return false; 1436 } 1437 1438 private static boolean isAsciiAlnum(char c) { 1439 if ('a' <= c && c <= 'z') 1440 return true; 1441 if ('A' <= c && c <= 'Z') 1442 return true; 1443 if ('0' <= c && c <= '9') 1444 return true; 1445 return false; 1446 } 1447 1448 private void expect(char c) throws RegexSyntaxException { 1449 if (curChar != c) 1450 throw makeException("expected", new String (new char[]{c})); 1451 } 1452 1453 private CharClass parseCharClassExpr() throws RegexSyntaxException { 1454 boolean compl; 1455 if (curChar == '^') { 1456 advance(); 1457 compl = true; 1458 } else 1459 compl = false; 1460 List members = new ArrayList(10); 1461 boolean first = true; 1462 do { 1463 CharClass lower = parseCharClassEscOrXmlChar(first); 1464 first = false; 1465 members.add(lower); 1466 if (curChar == '-') { 1467 advance(); 1468 if (curChar == ']') { break; 1470 } 1471 if (curChar == '[') { 1472 break; 1473 } 1474 CharClass upper = parseCharClassEscOrXmlChar(first); 1475 if (lower.getSingleChar() < 0 || upper.getSingleChar() < 0) 1476 throw makeException("multi_range"); 1477 if (lower.getSingleChar() > upper.getSingleChar()) 1478 throw makeException("invalid range (start > end)"); 1479 members.set(members.size() - 1, 1480 new CharRange(lower.getSingleChar(), upper.getSingleChar())); 1481 if (curChar == '-') { 1482 advance(); 1483 expect('['); 1484 break; 1485 } 1486 } 1487 } while (curChar != ']'); 1488 CharClass result; 1489 if (members.size() == 1) 1490 result = (CharClass)members.get(0); 1491 else 1492 result = new Union(members); 1493 if (compl) 1494 result = new Complement(result); 1495 if (curChar == '[') { 1496 advance(); 1497 result = new Subtraction(result, parseCharClassExpr()); 1498 expect(']'); 1499 } 1500 advance(); 1501 return result; 1502 } 1503 1504 private CharClass parseCharClassEscOrXmlChar(boolean first) throws RegexSyntaxException { 1505 switch (curChar) { 1506 case EOS: 1507 if (eos) 1508 expect(']'); 1509 break; 1510 case '\\': 1511 advance(); 1512 return parseEsc(); 1513 case '[': 1514 case ']': 1515 throw makeException("character must be escaped", new String (new char[]{curChar})); 1516 case '-': 1517 if (!first) { 1518 throw makeException("character must be escaped", new String (new char[]{curChar})); 1519 } 1520 break; 1521 } 1522 CharClass tem; 1523 if (XMLChar.isSurrogate(curChar)) { 1524 if (!XMLChar.isHighSurrogate(curChar)) 1525 throw makeException("invalid surrogate pair"); 1526 char c1 = curChar; 1527 advance(); 1528 if (!XMLChar.isLowSurrogate(curChar)) 1529 throw makeException("invalid surrogate pair"); 1530 tem = new WideSingleChar(XMLChar.supplemental(c1, curChar)); 1531 } else 1532 tem = new SingleChar(curChar); 1533 advance(); 1534 return tem; 1535 } 1536 1537 private RegexSyntaxException makeException(String key) { 1538 return new RegexSyntaxException("Error at character " + (pos - 1) + 1539 " in regular expression: " + key); 1540 } 1541 1542 private RegexSyntaxException makeException(String key, String arg) { 1543 return new RegexSyntaxException("Error at character " + (pos - 1) + 1544 " in regular expression: " + key + 1545 " (" + arg + ')'); 1546 } 1547 1548 private static boolean isJavaMetaChar(char c) { 1549 switch (c) { 1550 case '\\': 1551 case '^': 1552 case '?': 1553 case '*': 1554 case '+': 1555 case '(': 1556 case ')': 1557 case '{': 1558 case '}': 1559 case '|': 1560 case '[': 1561 case ']': 1562 case '-': 1563 case '&': 1564 case '$': 1565 case '.': 1566 return true; 1567 } 1568 return false; 1569 } 1570 1571 private static synchronized CharClass getCategoryCharClass(int ci) { 1572 if (categoryCharClasses[ci] == null) 1573 categoryCharClasses[ci] = computeCategoryCharClass(categories.charAt(ci)); 1574 return categoryCharClasses[ci]; 1575 } 1576 1577 private static synchronized CharClass getSubCategoryCharClass(int sci) { 1578 if (subCategoryCharClasses[sci] == null) 1579 subCategoryCharClasses[sci] = computeSubCategoryCharClass(subCategories.substring(sci * 2, (sci + 1) * 2)); 1580 return subCategoryCharClasses[sci]; 1581 } 1582 1583 private static final char UNICODE_3_1_ADD_Lu = '\u03F4'; private static final char UNICODE_3_1_ADD_Ll = '\u03F5'; private static final char UNICODE_3_1_CHANGE_No_to_Nl_MIN = '\u16EE'; 1587 private static final char UNICODE_3_1_CHANGE_No_to_Nl_MAX = '\u16F0'; 1588 private static final String CATEGORY_Pi = "\u00AB\u2018\u201B\u201C\u201F\u2039"; private static final String CATEGORY_Pf = "\u00BB\u2019\u201D\u203A"; 1591 private static CharClass computeCategoryCharClass(char code) { 1592 List classes = new Vector(); 1593 classes.add(new Property(new String (new char[]{code}))); 1594 for (int ci = CATEGORY_NAMES.indexOf(code); ci >= 0; ci = CATEGORY_NAMES.indexOf(code, ci + 1)) { 1595 int[] addRanges = CATEGORY_RANGES[ci / 2]; 1596 for (int i = 0; i < addRanges.length; i += 2) 1597 classes.add(new CharRange(addRanges[i], addRanges[i + 1])); 1598 } 1599 if (code == 'P') 1600 classes.add(makeCharClass(CATEGORY_Pi + CATEGORY_Pf)); 1601 if (code == 'L') { 1602 classes.add(new SingleChar(UNICODE_3_1_ADD_Ll)); 1603 classes.add(new SingleChar(UNICODE_3_1_ADD_Lu)); 1604 } 1605 if (code == 'C') { 1606 classes.add(new Subtraction(new Property("Cn"), 1608 new Union(new CharClass[]{new SingleChar(UNICODE_3_1_ADD_Lu), 1609 new SingleChar(UNICODE_3_1_ADD_Ll)}))); 1610 List assignedRanges = new Vector(); 1611 for (int i = 0; i < CATEGORY_RANGES.length; i++) 1612 for (int j = 0; j < CATEGORY_RANGES[i].length; j += 2) 1613 assignedRanges.add(new CharRange(CATEGORY_RANGES[i][j], 1614 CATEGORY_RANGES[i][j + 1])); 1615 classes.add(new Subtraction(new CharRange(NONBMP_MIN, NONBMP_MAX), 1616 new Union(assignedRanges))); 1617 } 1618 if (classes.size() == 1) 1619 return (CharClass)classes.get(0); 1620 return new Union(classes); 1621 } 1622 1623 private static CharClass computeSubCategoryCharClass(String name) { 1624 CharClass base = new Property(name); 1625 int sci = CATEGORY_NAMES.indexOf(name); 1626 if (sci < 0) { 1627 if (name.equals("Cn")) { 1628 List assignedRanges = new Vector(); 1630 assignedRanges.add(new SingleChar(UNICODE_3_1_ADD_Lu)); 1631 assignedRanges.add(new SingleChar(UNICODE_3_1_ADD_Ll)); 1632 for (int i = 0; i < CATEGORY_RANGES.length; i++) 1633 for (int j = 0; j < CATEGORY_RANGES[i].length; j += 2) 1634 assignedRanges.add(new CharRange(CATEGORY_RANGES[i][j], 1635 CATEGORY_RANGES[i][j + 1])); 1636 return new Subtraction(new Union(new CharClass[]{base, new CharRange(NONBMP_MIN, NONBMP_MAX)}), 1637 new Union(assignedRanges)); 1638 } 1639 if (name.equals("Pi")) 1640 return makeCharClass(CATEGORY_Pi); 1641 if (name.equals("Pf")) 1642 return makeCharClass(CATEGORY_Pf); 1643 return base; 1644 } 1645 List classes = new Vector(); 1646 classes.add(base); 1647 int[] addRanges = CATEGORY_RANGES[sci / 2]; 1648 for (int i = 0; i < addRanges.length; i += 2) 1649 classes.add(new CharRange(addRanges[i], addRanges[i + 1])); 1650 if (name.equals("Lu")) 1651 classes.add(new SingleChar(UNICODE_3_1_ADD_Lu)); 1652 else if (name.equals("Ll")) 1653 classes.add(new SingleChar(UNICODE_3_1_ADD_Ll)); 1654 else if (name.equals("Nl")) 1655 classes.add(new CharRange(UNICODE_3_1_CHANGE_No_to_Nl_MIN, UNICODE_3_1_CHANGE_No_to_Nl_MAX)); 1656 else if (name.equals("No")) 1657 return new Subtraction(new Union(classes), 1658 new CharRange(UNICODE_3_1_CHANGE_No_to_Nl_MIN, 1659 UNICODE_3_1_CHANGE_No_to_Nl_MAX)); 1660 return new Union(classes); 1661 } 1662 1663 private static CharClass makeCharClass(String members) { 1664 List list = new Vector(); 1665 for (int i = 0, len = members.length(); i < len; i++) 1666 list.add(new SingleChar(members.charAt(i))); 1667 return new Union(list); 1668 } 1669 1670 public static void main(String [] args) throws RegexSyntaxException { 1671 String s = translate(args[0], args[1].equals("xpath")); 1672 for (int i = 0, len = s.length(); i < len; i++) { 1673 char c = s.charAt(i); 1674 if (c >= 0x20 && c <= 0x7e) 1675 System.err.print(c); 1676 else { 1677 System.err.print("\\u"); 1678 for (int shift = 12; shift >= 0; shift -= 4) 1679 System.err.print("0123456789ABCDEF".charAt((c >> shift) & 0xF)); 1680 } 1681 } 1682 System.err.println(); 1683 } 1684 1685 1686 1688 1689} 1690 1691 1709 | Popular Tags |