1 57 58 package org.enhydra.apache.xerces.utils.regex; 59 60 import java.util.Hashtable ; 61 import java.util.Vector ; 62 63 66 class Token implements java.io.Serializable { 67 static final boolean COUNTTOKENS = true; 68 static int tokens = 0; 69 70 static final int CHAR = 0; static final int DOT = 11; static final int CONCAT = 1; static final int UNION = 2; static final int CLOSURE = 3; static final int RANGE = 4; static final int NRANGE = 5; static final int PAREN = 6; static final int EMPTY = 7; static final int ANCHOR = 8; static final int NONGREEDYCLOSURE = 9; static final int STRING = 10; static final int BACKREFERENCE = 12; static final int LOOKAHEAD = 20; static final int NEGATIVELOOKAHEAD = 21; static final int LOOKBEHIND = 22; static final int NEGATIVELOOKBEHIND = 23; static final int INDEPENDENT = 24; static final int MODIFIERGROUP = 25; static final int CONDITION = 26; 91 static final int UTF16_MAX = 0x10ffff; 92 93 int type; 94 95 static protected Token token_dot; 96 static protected Token token_0to9; 97 static protected Token token_wordchars; 98 static protected Token token_not_0to9; 99 static protected Token token_not_wordchars; 100 static protected Token token_spaces; 101 static protected Token token_not_spaces; 102 static protected Token token_empty; 103 static protected Token token_linebeginning; 104 static protected Token token_linebeginning2; 105 static protected Token token_lineend; 106 static protected Token token_stringbeginning; 107 static protected Token token_stringend; 108 static protected Token token_stringend2; 109 static protected Token token_wordedge; 110 static protected Token token_not_wordedge; 111 static protected Token token_wordbeginning; 112 static protected Token token_wordend; 113 static { 114 Token.token_empty = new Token(Token.EMPTY); 115 116 Token.token_linebeginning = Token.createAnchor('^'); 117 Token.token_linebeginning2 = Token.createAnchor('@'); 118 Token.token_lineend = Token.createAnchor('$'); 119 Token.token_stringbeginning = Token.createAnchor('A'); 120 Token.token_stringend = Token.createAnchor('z'); 121 Token.token_stringend2 = Token.createAnchor('Z'); 122 Token.token_wordedge = Token.createAnchor('b'); 123 Token.token_not_wordedge = Token.createAnchor('B'); 124 Token.token_wordbeginning = Token.createAnchor('<'); 125 Token.token_wordend = Token.createAnchor('>'); 126 127 Token.token_dot = new Token(Token.DOT); 128 129 Token.token_0to9 = Token.createRange(); 130 Token.token_0to9.addRange('0', '9'); 131 Token.token_wordchars = Token.createRange(); 132 Token.token_wordchars.addRange('0', '9'); 133 Token.token_wordchars.addRange('A', 'Z'); 134 Token.token_wordchars.addRange('_', '_'); 135 Token.token_wordchars.addRange('a', 'z'); 136 Token.token_spaces = Token.createRange(); 137 Token.token_spaces.addRange('\t', '\t'); 138 Token.token_spaces.addRange('\n', '\n'); 139 Token.token_spaces.addRange('\f', '\f'); 140 Token.token_spaces.addRange('\r', '\r'); 141 Token.token_spaces.addRange(' ', ' '); 142 143 Token.token_not_0to9 = Token.complementRanges(Token.token_0to9); 144 Token.token_not_wordchars = Token.complementRanges(Token.token_wordchars); 145 Token.token_not_spaces = Token.complementRanges(Token.token_spaces); 146 } 147 148 static Token.ParenToken createLook(int type, Token child) { 149 if (COUNTTOKENS) Token.tokens ++; 150 return new Token.ParenToken(type, child, 0); 151 } 152 static Token.ParenToken createParen(Token child, int pnumber) { 153 if (COUNTTOKENS) Token.tokens ++; 154 return new Token.ParenToken(Token.PAREN, child, pnumber); 155 } 156 static Token.ClosureToken createClosure(Token tok) { 157 if (COUNTTOKENS) Token.tokens ++; 158 return new Token.ClosureToken(Token.CLOSURE, tok); 159 } 160 static Token.ClosureToken createNGClosure(Token tok) { 161 if (COUNTTOKENS) Token.tokens ++; 162 return new Token.ClosureToken(Token.NONGREEDYCLOSURE, tok); 163 } 164 static Token.ConcatToken createConcat(Token tok1, Token tok2) { 165 if (COUNTTOKENS) Token.tokens ++; 166 return new Token.ConcatToken(tok1, tok2); 167 } 168 static Token.UnionToken createConcat() { 169 if (COUNTTOKENS) Token.tokens ++; 170 return new Token.UnionToken(Token.CONCAT); } 172 static Token.UnionToken createUnion() { 173 if (COUNTTOKENS) Token.tokens ++; 174 return new Token.UnionToken(Token.UNION); 175 } 176 static Token createEmpty() { 177 return Token.token_empty; 178 } 179 static RangeToken createRange() { 180 if (COUNTTOKENS) Token.tokens ++; 181 return new RangeToken(Token.RANGE); 182 } 183 static RangeToken createNRange() { 184 if (COUNTTOKENS) Token.tokens ++; 185 return new RangeToken(Token.NRANGE); 186 } 187 static Token.CharToken createChar(int ch) { 188 if (COUNTTOKENS) Token.tokens ++; 189 return new Token.CharToken(Token.CHAR, ch); 190 } 191 static private Token.CharToken createAnchor(int ch) { 192 if (COUNTTOKENS) Token.tokens ++; 193 return new Token.CharToken(Token.ANCHOR, ch); 194 } 195 static Token.StringToken createBackReference(int refno) { 196 if (COUNTTOKENS) Token.tokens ++; 197 return new Token.StringToken(Token.BACKREFERENCE, null, refno); 198 } 199 static Token.StringToken createString(String str) { 200 if (COUNTTOKENS) Token.tokens ++; 201 return new Token.StringToken(Token.STRING, str, 0); 202 } 203 static Token.ModifierToken createModifierGroup(Token child, int add, int mask) { 204 if (COUNTTOKENS) Token.tokens ++; 205 return new Token.ModifierToken(child, add, mask); 206 } 207 static Token.ConditionToken createCondition(int refno, Token condition, 208 Token yespat, Token nopat) { 209 if (COUNTTOKENS) Token.tokens ++; 210 return new Token.ConditionToken(refno, condition, yespat, nopat); 211 } 212 213 protected Token(int type) { 214 this.type = type; 215 } 216 217 220 int size() { 221 return 0; 222 } 223 Token getChild(int index) { 224 return null; 225 } 226 void addChild(Token tok) { 227 throw new RuntimeException ("Not supported."); 228 } 229 230 protected void addRange(int start, int end) { 232 throw new RuntimeException ("Not supported."); 233 } 234 protected void sortRanges() { 235 throw new RuntimeException ("Not supported."); 236 } 237 protected void compactRanges() { 238 throw new RuntimeException ("Not supported."); 239 } 240 protected void mergeRanges(Token tok) { 241 throw new RuntimeException ("Not supported."); 242 } 243 protected void subtractRanges(Token tok) { 244 throw new RuntimeException ("Not supported."); 245 } 246 protected void intersectRanges(Token tok) { 247 throw new RuntimeException ("Not supported."); 248 } 249 static Token complementRanges(Token tok) { 250 return RangeToken.complementRanges(tok); 251 } 252 253 254 void setMin(int min) { } 256 void setMax(int max) { } 258 int getMin() { return -1; 260 } 261 int getMax() { return -1; 263 } 264 int getReferenceNumber() { return 0; 266 } 267 String getString() { return null; 269 } 270 271 int getParenNumber() { 272 return 0; 273 } 274 int getChar() { 275 return -1; 276 } 277 278 public String toString() { 279 return this.toString(0); 280 } 281 public String toString(int options) { 282 return this.type == Token.DOT ? "." : ""; 283 } 284 285 288 final int getMinLength() { 289 switch (this.type) { 290 case CONCAT: 291 int sum = 0; 292 for (int i = 0; i < this.size(); i ++) 293 sum += this.getChild(i).getMinLength(); 294 return sum; 295 296 case CONDITION: 297 case UNION: 298 if (this.size() == 0) 299 return 0; 300 int ret = this.getChild(0).getMinLength(); 301 for (int i = 1; i < this.size(); i ++) { 302 int min = this.getChild(i).getMinLength(); 303 if (min < ret) ret = min; 304 } 305 return ret; 306 307 case CLOSURE: 308 case NONGREEDYCLOSURE: 309 if (this.getMin() >= 0) 310 return this.getMin() * this.getChild(0).getMinLength(); 311 return 0; 312 313 case EMPTY: 314 case ANCHOR: 315 return 0; 316 317 case DOT: 318 case CHAR: 319 case RANGE: 320 case NRANGE: 321 return 1; 322 323 case INDEPENDENT: 324 case PAREN: 325 case MODIFIERGROUP: 326 return this.getChild(0).getMinLength(); 327 328 case BACKREFERENCE: 329 return 0; 331 case STRING: 332 return this.getString().length(); 333 334 case LOOKAHEAD: 335 case NEGATIVELOOKAHEAD: 336 case LOOKBEHIND: 337 case NEGATIVELOOKBEHIND: 338 return 0; 340 default: 341 throw new RuntimeException ("Token#getMinLength(): Invalid Type: "+this.type); 342 } 343 } 344 345 final int getMaxLength() { 346 switch (this.type) { 347 case CONCAT: 348 int sum = 0; 349 for (int i = 0; i < this.size(); i ++) { 350 int d = this.getChild(i).getMaxLength(); 351 if (d < 0) return -1; 352 sum += d; 353 } 354 return sum; 355 356 case CONDITION: 357 case UNION: 358 if (this.size() == 0) 359 return 0; 360 int ret = this.getChild(0).getMaxLength(); 361 for (int i = 1; ret >= 0 && i < this.size(); i ++) { 362 int max = this.getChild(i).getMaxLength(); 363 if (max < 0) { ret = -1; 365 break; 366 } 367 if (max > ret) ret = max; 368 } 369 return ret; 370 371 case CLOSURE: 372 case NONGREEDYCLOSURE: 373 if (this.getMax() >= 0) 374 return this.getMax() * this.getChild(0).getMaxLength(); 377 return -1; 378 379 case EMPTY: 380 case ANCHOR: 381 return 0; 382 383 case CHAR: 384 return 1; 385 case DOT: 386 case RANGE: 387 case NRANGE: 388 return 2; 389 390 case INDEPENDENT: 391 case PAREN: 392 case MODIFIERGROUP: 393 return this.getChild(0).getMaxLength(); 394 395 case BACKREFERENCE: 396 return -1; 398 case STRING: 399 return this.getString().length(); 400 401 case LOOKAHEAD: 402 case NEGATIVELOOKAHEAD: 403 case LOOKBEHIND: 404 case NEGATIVELOOKBEHIND: 405 return 0; 407 default: 408 throw new RuntimeException ("Token#getMaxLength(): Invalid Type: "+this.type); 409 } 410 } 411 412 static final int FC_CONTINUE = 0; 413 static final int FC_TERMINAL = 1; 414 static final int FC_ANY = 2; 415 private static final boolean isSet(int options, int flag) { 416 return (options & flag) == flag; 417 } 418 final int analyzeFirstCharacter(RangeToken result, int options) { 419 switch (this.type) { 420 case CONCAT: 421 int ret = FC_CONTINUE; 422 for (int i = 0; i < this.size(); i ++) 423 if ((ret = this.getChild(i).analyzeFirstCharacter(result, options)) != FC_CONTINUE) 424 break; 425 return ret; 426 427 case UNION: 428 if (this.size() == 0) 429 return FC_CONTINUE; 430 435 int ret2 = FC_CONTINUE; 436 boolean hasEmpty = false; 437 for (int i = 0; i < this.size(); i ++) { 438 ret2 = this.getChild(i).analyzeFirstCharacter(result, options); 439 if (ret2 == FC_ANY) 440 break; 441 else if (ret2 == FC_CONTINUE) 442 hasEmpty = true; 443 } 444 return hasEmpty ? FC_CONTINUE : ret2; 445 446 case CONDITION: 447 int ret3 = this.getChild(0).analyzeFirstCharacter(result, options); 448 if (this.size() == 1) return FC_CONTINUE; 449 if (ret3 == FC_ANY) return ret3; 450 int ret4 = this.getChild(1).analyzeFirstCharacter(result, options); 451 if (ret4 == FC_ANY) return ret4; 452 return ret3 == FC_CONTINUE || ret4 == FC_CONTINUE ? FC_CONTINUE : FC_TERMINAL; 453 454 case CLOSURE: 455 case NONGREEDYCLOSURE: 456 this.getChild(0).analyzeFirstCharacter(result, options); 457 return FC_CONTINUE; 458 459 case EMPTY: 460 case ANCHOR: 461 return FC_CONTINUE; 462 463 case CHAR: 464 int ch = this.getChar(); 465 result.addRange(ch, ch); 466 if (ch < 0x10000 && isSet(options, RegularExpression.IGNORE_CASE)) { 467 ch = Character.toUpperCase((char)ch); 468 result.addRange(ch, ch); 469 ch = Character.toLowerCase((char)ch); 470 result.addRange(ch, ch); 471 } 472 return FC_TERMINAL; 473 474 case DOT: if (isSet(options, RegularExpression.SINGLE_LINE)) { 476 return FC_CONTINUE; } else { 478 return FC_CONTINUE; 479 487 } 488 489 case RANGE: 490 if (isSet(options, RegularExpression.IGNORE_CASE)) { 491 result.mergeRanges(((RangeToken)this).getCaseInsensitiveToken()); 492 } else { 493 result.mergeRanges(this); 494 } 495 return FC_TERMINAL; 496 497 case NRANGE: if (isSet(options, RegularExpression.IGNORE_CASE)) { 499 result.mergeRanges(Token.complementRanges(((RangeToken)this).getCaseInsensitiveToken())); 500 } else { 501 result.mergeRanges(Token.complementRanges(this)); 502 } 503 return FC_TERMINAL; 504 505 case INDEPENDENT: 506 case PAREN: 507 return this.getChild(0).analyzeFirstCharacter(result, options); 508 509 case MODIFIERGROUP: 510 options |= ((ModifierToken)this).getOptions(); 511 options &= ~((ModifierToken)this).getOptionsMask(); 512 return this.getChild(0).analyzeFirstCharacter(result, options); 513 514 case BACKREFERENCE: 515 result.addRange(0, UTF16_MAX); return FC_ANY; 517 518 case STRING: 519 int cha = this.getString().charAt(0); 520 int ch2; 521 if (REUtil.isHighSurrogate(cha) 522 && this.getString().length() >= 2 523 && REUtil.isLowSurrogate((ch2 = this.getString().charAt(1)))) 524 cha = REUtil.composeFromSurrogates(cha, ch2); 525 result.addRange(cha, cha); 526 if (cha < 0x10000 && isSet(options, RegularExpression.IGNORE_CASE)) { 527 cha = Character.toUpperCase((char)cha); 528 result.addRange(cha, cha); 529 cha = Character.toLowerCase((char)cha); 530 result.addRange(cha, cha); 531 } 532 return FC_TERMINAL; 533 534 case LOOKAHEAD: 535 case NEGATIVELOOKAHEAD: 536 case LOOKBEHIND: 537 case NEGATIVELOOKBEHIND: 538 return FC_CONTINUE; 539 540 default: 541 throw new RuntimeException ("Token#analyzeHeadCharacter(): Invalid Type: "+this.type); 542 } 543 } 544 545 private final boolean isShorterThan(Token tok) { 546 if (tok == null) return false; 547 557 int mylength; 558 if (this.type == STRING) mylength = this.getString().length(); 559 else throw new RuntimeException ("Internal Error: Illegal type: "+this.type); 560 int otherlength; 561 if (tok.type == STRING) otherlength = tok.getString().length(); 562 else throw new RuntimeException ("Internal Error: Illegal type: "+tok.type); 563 return mylength < otherlength; 564 } 565 566 static class FixedStringContainer { 567 Token token = null; 568 int options = 0; 569 FixedStringContainer() { 570 } 571 } 572 573 final void findFixedString(FixedStringContainer container, int options) { 574 switch (this.type) { 575 case CONCAT: 576 Token prevToken = null; 577 int prevOptions = 0; 578 for (int i = 0; i < this.size(); i ++) { 579 this.getChild(i).findFixedString(container, options); 580 if (prevToken == null || prevToken.isShorterThan(container.token)) { 581 prevToken = container.token; 582 prevOptions = container.options; 583 } 584 } 585 container.token = prevToken; 586 container.options = prevOptions; 587 return; 588 589 case UNION: 590 case CLOSURE: 591 case NONGREEDYCLOSURE: 592 case EMPTY: 593 case ANCHOR: 594 case RANGE: 595 case DOT: 596 case NRANGE: 597 case BACKREFERENCE: 598 case LOOKAHEAD: 599 case NEGATIVELOOKAHEAD: 600 case LOOKBEHIND: 601 case NEGATIVELOOKBEHIND: 602 case CONDITION: 603 container.token = null; 604 return; 605 606 case CHAR: container.token = null; return; 610 case STRING: 611 container.token = this; 612 container.options = options; 613 return; 614 615 case INDEPENDENT: 616 case PAREN: 617 this.getChild(0).findFixedString(container, options); 618 return; 619 620 case MODIFIERGROUP: 621 options |= ((ModifierToken)this).getOptions(); 622 options &= ~((ModifierToken)this).getOptionsMask(); 623 this.getChild(0).findFixedString(container, options); 624 return; 625 626 default: 627 throw new RuntimeException ("Token#findFixedString(): Invalid Type: "+this.type); 628 } 629 } 630 631 boolean match(int ch) { 632 throw new RuntimeException ("NFAArrow#match(): Internal error: "+this.type); 633 } 634 635 static protected Hashtable categories = new Hashtable (); 637 static protected Hashtable categories2 = null; 638 static final String [] categoryNames = { 639 "Cn", "Lu", "Ll", "Lt", "Lm", "Lo", "Mn", "Me", "Mc", "Nd", 640 "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", null, "Co", "Cs", 641 "Pd", "Ps", "Pe", "Pc", "Po", "Sm", "Sc", "Sk", "So", "Pi", "Pf", "L", "M", "N", "Z", "C", "P", "S", }; 645 646 static final int CHAR_INIT_QUOTE = 29; static final int CHAR_FINAL_QUOTE = 30; static final int CHAR_LETTER = 31; 650 static final int CHAR_MARK = 32; 651 static final int CHAR_NUMBER = 33; 652 static final int CHAR_SEPARATOR = 34; 653 static final int CHAR_OTHER = 35; 654 static final int CHAR_PUNCTUATION = 36; 655 static final int CHAR_SYMBOL = 37; 656 657 static final String [] blockNames = { 659 "Basic Latin", 660 "Latin-1 Supplement", 661 "Latin Extended-A", 662 "Latin Extended-B", 663 "IPA Extensions", 664 "Spacing Modifier Letters", 665 "Combining Diacritical Marks", 666 "Greek", 667 "Cyrillic", 668 "Armenian", 669 "Hebrew", 670 "Arabic", 671 "Syriac", 672 "Thaana", 673 "Devanagari", 674 "Bengali", 675 "Gurmukhi", 676 "Gujarati", 677 "Oriya", 678 "Tamil", 679 "Telugu", 680 "Kannada", 681 "Malayalam", 682 "Sinhala", 683 "Thai", 684 "Lao", 685 "Tibetan", 686 "Myanmar", 687 "Georgian", 688 "Hangul Jamo", 689 "Ethiopic", 690 "Cherokee", 691 "Unified Canadian Aboriginal Syllabics", 692 "Ogham", 693 "Runic", 694 "Khmer", 695 "Mongolian", 696 "Latin Extended Additional", 697 "Greek Extended", 698 "General Punctuation", 699 "Superscripts and Subscripts", 700 "Currency Symbols", 701 "Combining Marks for Symbols", 702 "Letterlike Symbols", 703 "Number Forms", 704 "Arrows", 705 "Mathematical Operators", 706 "Miscellaneous Technical", 707 "Control Pictures", 708 "Optical Character Recognition", 709 "Enclosed Alphanumerics", 710 "Box Drawing", 711 "Block Elements", 712 "Geometric Shapes", 713 "Miscellaneous Symbols", 714 "Dingbats", 715 "Braille Patterns", 716 "CJK Radicals Supplement", 717 "Kangxi Radicals", 718 "Ideographic Description Characters", 719 "CJK Symbols and Punctuation", 720 "Hiragana", 721 "Katakana", 722 "Bopomofo", 723 "Hangul Compatibility Jamo", 724 "Kanbun", 725 "Bopomofo Extended", 726 "Enclosed CJK Letters and Months", 727 "CJK Compatibility", 728 "CJK Unified Ideographs Extension A", 729 "CJK Unified Ideographs", 730 "Yi Syllables", 731 "Yi Radicals", 732 "Hangul Syllables", 733 "High Surrogates", 734 "High Private Use Surrogates", 735 "Low Surrogates", 736 "Private Use", 737 "CJK Compatibility Ideographs", 738 "Alphabetic Presentation Forms", 739 "Arabic Presentation Forms-A", 740 "Combining Half Marks", 741 "CJK Compatibility Forms", 742 "Small Form Variants", 743 "Arabic Presentation Forms-B", 744 "Specials", 745 "Halfwidth and Fullwidth Forms", 746 "Old Italic", 748 "Gothic", 749 "Deseret", 750 "Byzantine Musical Symbols", 751 "Musical Symbols", 752 "Mathematical Alphanumeric Symbols", 753 "CJK Unified Ideographs Extension B", 754 "CJK Compatibility Ideographs Supplement", 755 "Tags", 756 758 }; 759 static final String blockRanges = 764 "\u0000\u007F\u0080\u00FF\u0100\u017F\u0180\u024F\u0250\u02AF\u02B0\u02FF\u0300\u036F" 765 +"\u0370\u03FF\u0400\u04FF\u0530\u058F\u0590\u05FF\u0600\u06FF\u0700\u074F\u0780\u07BF" 766 +"\u0900\u097F\u0980\u09FF\u0A00\u0A7F\u0A80\u0AFF\u0B00\u0B7F\u0B80\u0BFF\u0C00\u0C7F\u0C80\u0CFF" 767 +"\u0D00\u0D7F\u0D80\u0DFF\u0E00\u0E7F\u0E80\u0EFF\u0F00\u0FFF\u1000\u109F\u10A0\u10FF\u1100\u11FF" 768 +"\u1200\u137F\u13A0\u13FF\u1400\u167F\u1680\u169F\u16A0\u16FF\u1780\u17FF\u1800\u18AF\u1E00\u1EFF" 769 +"\u1F00\u1FFF\u2000\u206F\u2070\u209F\u20A0\u20CF\u20D0\u20FF\u2100\u214F\u2150\u218F\u2190\u21FF\u2200\u22FF" 770 +"\u2300\u23FF\u2400\u243F\u2440\u245F\u2460\u24FF\u2500\u257F\u2580\u259F\u25A0\u25FF\u2600\u26FF\u2700\u27BF" 771 +"\u2800\u28FF\u2E80\u2EFF\u2F00\u2FDF\u2FF0\u2FFF\u3000\u303F\u3040\u309F\u30A0\u30FF\u3100\u312F\u3130\u318F" 772 +"\u3190\u319F\u31A0\u31BF\u3200\u32FF\u3300\u33FF\u3400\u4DB5\u4E00\u9FFF\uA000\uA48F\uA490\uA4CF" 773 +"\uAC00\uD7A3\uD800\uDB7F\uDB80\uDBFF\uDC00\uDFFF\uE000\uF8FF\uF900\uFAFF\uFB00\uFB4F\uFB50\uFDFF" 774 +"\uFE20\uFE2F\uFE30\uFE4F\uFE50\uFE6F\uFE70\uFEFE\uFEFF\uFEFF\uFF00\uFFEF\u10300\u1032F\u10330\u1034F" 775 +"\u10400\u1044F\u1D000\u1D0FFs\u1D100\u1D1FF\u1D400\u1D7FF\u20000\u2A6D6\u2F800\u2FA1F\uE0000\uE007F"; 776 777 static protected RangeToken getRange(String name, boolean positive) { 778 if (Token.categories.size() == 0) { 779 synchronized (Token.categories) { 780 Token[] ranges = new Token[Token.categoryNames.length]; 781 for (int i = 0; i < ranges.length; i ++) { 782 ranges[i] = Token.createRange(); 783 } 784 int type; 785 for (int i = 0; i < 0x10000; i ++) { 786 type = Character.getType((char)i); 787 if (type == Character.START_PUNCTUATION || 788 type == Character.END_PUNCTUATION) { 789 if (i == 0x00AB || i == 0x2018 || i == 0x201B || i == 0x201C || 791 i == 0x201F || i == 0x2039) { 792 type = CHAR_INIT_QUOTE; 793 } 794 if (i == 0x00BB || i == 0x2019 || i == 0x201D || i == 0x203A ) { 796 type = CHAR_FINAL_QUOTE; 797 } 798 } 799 ranges[type].addRange(i, i); 800 switch (type) { 801 case Character.UPPERCASE_LETTER: 802 case Character.LOWERCASE_LETTER: 803 case Character.TITLECASE_LETTER: 804 case Character.MODIFIER_LETTER: 805 case Character.OTHER_LETTER: 806 type = CHAR_LETTER; 807 break; 808 case Character.NON_SPACING_MARK: 809 case Character.COMBINING_SPACING_MARK: 810 case Character.ENCLOSING_MARK: 811 type = CHAR_MARK; 812 break; 813 case Character.DECIMAL_DIGIT_NUMBER: 814 case Character.LETTER_NUMBER: 815 case Character.OTHER_NUMBER: 816 type = CHAR_NUMBER; 817 break; 818 case Character.SPACE_SEPARATOR: 819 case Character.LINE_SEPARATOR: 820 case Character.PARAGRAPH_SEPARATOR: 821 type = CHAR_SEPARATOR; 822 break; 823 case Character.CONTROL: 824 case Character.FORMAT: 825 case Character.SURROGATE: 826 case Character.PRIVATE_USE: 827 case Character.UNASSIGNED: 828 type = CHAR_OTHER; 829 break; 830 case Character.CONNECTOR_PUNCTUATION: 831 case Character.DASH_PUNCTUATION: 832 case Character.START_PUNCTUATION: 833 case Character.END_PUNCTUATION: 834 case CHAR_INIT_QUOTE: 835 case CHAR_FINAL_QUOTE: 836 case Character.OTHER_PUNCTUATION: 837 type = CHAR_PUNCTUATION; 838 break; 839 case Character.MATH_SYMBOL: 840 case Character.CURRENCY_SYMBOL: 841 case Character.MODIFIER_SYMBOL: 842 case Character.OTHER_SYMBOL: 843 type = CHAR_SYMBOL; 844 break; 845 default: 846 throw new RuntimeException ("org.enhydra.apache.xerces.utils.regex.Token#getRange(): Unknown Unicode category: "+type); 847 } 848 ranges[type].addRange(i, i); 849 } ranges[Character.UNASSIGNED].addRange(0x10000, Token.UTF16_MAX); 851 852 Token.categories2 = new Hashtable (); 853 for (int i = 0; i < ranges.length; i ++) { 854 if (Token.categoryNames[i] != null) { 855 if (i == Character.UNASSIGNED) { ranges[i].addRange(0x10000, Token.UTF16_MAX); 857 } 858 Token.categories.put(Token.categoryNames[i], ranges[i]); 859 Token.categories2.put(Token.categoryNames[i], 860 Token.complementRanges(ranges[i])); 861 } 862 } 863 StringBuffer buffer = new StringBuffer (50); 867 int location = 0; 868 for (int i = 0; i < Token.blockNames.length; i ++) { 869 Token r1 = Token.createRange(); 870 location = i*2; 871 int rstart = Token.blockRanges.charAt(location); 872 int rend = Token.blockRanges.charAt(location+1); 873 String n = Token.blockNames[i]; 874 r1.addRange(rstart, rend); 878 if (n.equals("Specials")) 879 r1.addRange(0xfff0, 0xfffd); 880 if (n.equals("Private Use")) { 881 r1.addRange(0xF0000,0xFFFFD); 882 r1.addRange(0x100000,0x10FFFD); 883 } 884 Token.categories.put(n, r1); 885 Token.categories2.put(n, Token.complementRanges(r1)); 886 buffer.setLength(0); 887 buffer.append("Is"); 888 if (n.indexOf(' ') >= 0) { 889 for (int ci = 0; ci < n.length(); ci ++) 890 if (n.charAt(ci) != ' ') buffer.append((char)n.charAt(ci)); 891 } 892 else { 893 buffer.append(n); 894 } 895 Token.setAlias(buffer.toString(), n, true); 896 } 897 898 902 911 912 977 } } RangeToken tok = positive ? (RangeToken)Token.categories.get(name) 980 : (RangeToken)Token.categories2.get(name); 981 if (tok == null) System.out.println(name); 982 return tok; 983 } 984 985 private static void setAlias(String newName, String name, boolean positive) { 986 Token t1 = (Token)Token.categories.get(name); 987 Token t2 = (Token)Token.categories2.get(name); 988 if (positive) { 989 Token.categories.put(newName, t1); 990 Token.categories2.put(newName, t2); 991 } else { 992 Token.categories2.put(newName, t1); 993 Token.categories.put(newName, t2); 994 } 995 } 996 997 999 static final String viramaString = 1000 "\u094D" +"\u09CD" +"\u0A4D" +"\u0ACD" +"\u0B4D" +"\u0BCD" +"\u0C4D" +"\u0CCD" +"\u0D4D" +"\u0E3A" +"\u0F84"; 1012 static private Token token_grapheme = null; 1013 static synchronized protected Token getGraphemePattern() { 1014 if (Token.token_grapheme != null) 1015 return Token.token_grapheme; 1016 1017 Token base_char = Token.createRange(); base_char.mergeRanges(Token.getRange("ASSIGNED", true)); 1019 base_char.subtractRanges(Token.getRange("M", true)); 1020 base_char.subtractRanges(Token.getRange("C", true)); 1021 1022 Token virama = Token.createRange(); 1023 for (int i = 0; i < Token.viramaString.length(); i ++) { 1024 int ch = viramaString.charAt(i); 1025 virama.addRange(i, i); 1026 } 1027 1028 Token combiner_wo_virama = Token.createRange(); 1029 combiner_wo_virama.mergeRanges(Token.getRange("M", true)); 1030 combiner_wo_virama.addRange(0x1160, 0x11ff); combiner_wo_virama.addRange(0xff9e, 0xff9f); 1033 Token left = Token.createUnion(); left.addChild(base_char); 1035 left.addChild(Token.token_empty); 1036 1037 Token foo = Token.createUnion(); 1038 foo.addChild(Token.createConcat(virama, Token.getRange("L", true))); 1039 foo.addChild(combiner_wo_virama); 1040 1041 foo = Token.createClosure(foo); 1042 1043 foo = Token.createConcat(left, foo); 1044 1045 Token.token_grapheme = foo; 1046 return Token.token_grapheme; 1047 } 1048 1049 1052 static private Token token_ccs = null; 1053 static synchronized protected Token getCombiningCharacterSequence() { 1054 if (Token.token_ccs != null) 1055 return Token.token_ccs; 1056 1057 Token foo = Token.createClosure(Token.getRange("M", true)); foo = Token.createConcat(Token.getRange("M", false), foo); Token.token_ccs = foo; 1060 return Token.token_ccs; 1061 } 1062 1063 1065 1069 static class StringToken extends Token implements java.io.Serializable { 1070 String string; 1071 int refNumber; 1072 1073 StringToken(int type, String str, int n) { 1074 super(type); 1075 this.string = str; 1076 this.refNumber = n; 1077 } 1078 1079 int getReferenceNumber() { return this.refNumber; 1081 } 1082 String getString() { return this.string; 1084 } 1085 1086 public String toString(int options) { 1087 if (this.type == BACKREFERENCE) 1088 return "\\"+this.refNumber; 1089 else 1090 return REUtil.quoteMeta(this.string); 1091 } 1092 } 1093 1094 1097 static class ConcatToken extends Token implements java.io.Serializable { 1098 Token child; 1099 Token child2; 1100 1101 ConcatToken(Token t1, Token t2) { 1102 super(Token.CONCAT); 1103 this.child = t1; 1104 this.child2 = t2; 1105 } 1106 1107 int size() { 1108 return 2; 1109 } 1110 Token getChild(int index) { 1111 return index == 0 ? this.child : this.child2; 1112 } 1113 1114 public String toString(int options) { 1115 String ret; 1116 if (this.child2.type == CLOSURE && this.child2.getChild(0) == this.child) { 1117 ret = this.child.toString(options)+"+"; 1118 } else if (this.child2.type == NONGREEDYCLOSURE && this.child2.getChild(0) == this.child) { 1119 ret = this.child.toString(options)+"+?"; 1120 } else 1121 ret = this.child.toString(options)+this.child2.toString(options); 1122 return ret; 1123 } 1124 } 1125 1126 1129 static class CharToken extends Token implements java.io.Serializable { 1130 int chardata; 1131 1132 CharToken(int type, int ch) { 1133 super(type); 1134 this.chardata = ch; 1135 } 1136 1137 int getChar() { 1138 return this.chardata; 1139 } 1140 1141 public String toString(int options) { 1142 String ret; 1143 switch (this.type) { 1144 case CHAR: 1145 switch (this.chardata) { 1146 case '|': case '*': case '+': case '?': 1147 case '(': case ')': case '.': case '[': 1148 case '{': case '\\': 1149 ret = "\\"+(char)this.chardata; 1150 break; 1151 case '\f': ret = "\\f"; break; 1152 case '\n': ret = "\\n"; break; 1153 case '\r': ret = "\\r"; break; 1154 case '\t': ret = "\\t"; break; 1155 case 0x1b: ret = "\\e"; break; 1156 default: 1158 if (this.chardata >= 0x10000) { 1159 String pre = "0"+Integer.toHexString(this.chardata); 1160 ret = "\\v"+pre.substring(pre.length()-6, pre.length()); 1161 } else 1162 ret = ""+(char)this.chardata; 1163 } 1164 break; 1165 1166 case ANCHOR: 1167 if (this == Token.token_linebeginning || this == Token.token_lineend) 1168 ret = ""+(char)this.chardata; 1169 else 1170 ret = "\\"+(char)this.chardata; 1171 break; 1172 1173 default: 1174 ret = null; 1175 } 1176 return ret; 1177 } 1178 1179 boolean match(int ch) { 1180 if (this.type == CHAR) { 1181 return ch == this.chardata; 1182 } else 1183 throw new RuntimeException ("NFAArrow#match(): Internal error: "+this.type); 1184 } 1185 } 1186 1187 1190 static class ClosureToken extends Token implements java.io.Serializable { 1191 int min; 1192 int max; 1193 Token child; 1194 1195 ClosureToken(int type, Token tok) { 1196 super(type); 1197 this.child = tok; 1198 this.setMin(-1); 1199 this.setMax(-1); 1200 } 1201 1202 int size() { 1203 return 1; 1204 } 1205 Token getChild(int index) { 1206 return this.child; 1207 } 1208 1209 final void setMin(int min) { 1210 this.min = min; 1211 } 1212 final void setMax(int max) { 1213 this.max = max; 1214 } 1215 final int getMin() { 1216 return this.min; 1217 } 1218 final int getMax() { 1219 return this.max; 1220 } 1221 1222 public String toString(int options) { 1223 String ret; 1224 if (this.type == CLOSURE) { 1225 if (this.getMin() < 0 && this.getMax() < 0) { 1226 ret = this.child.toString(options)+"*"; 1227 } else if (this.getMin() == this.getMax()) { 1228 ret = this.child.toString(options)+"{"+this.getMin()+"}"; 1229 } else if (this.getMin() >= 0 && this.getMax() >= 0) { 1230 ret = this.child.toString(options)+"{"+this.getMin()+","+this.getMax()+"}"; 1231 } else if (this.getMin() >= 0 && this.getMax() < 0) { 1232 ret = this.child.toString(options)+"{"+this.getMin()+",}"; 1233 } else 1234 throw new RuntimeException ("Token#toString(): CLOSURE " 1235 +this.getMin()+", "+this.getMax()); 1236 } else { 1237 if (this.getMin() < 0 && this.getMax() < 0) { 1238 ret = this.child.toString(options)+"*?"; 1239 } else if (this.getMin() == this.getMax()) { 1240 ret = this.child.toString(options)+"{"+this.getMin()+"}?"; 1241 } else if (this.getMin() >= 0 && this.getMax() >= 0) { 1242 ret = this.child.toString(options)+"{"+this.getMin()+","+this.getMax()+"}?"; 1243 } else if (this.getMin() >= 0 && this.getMax() < 0) { 1244 ret = this.child.toString(options)+"{"+this.getMin()+",}?"; 1245 } else 1246 throw new RuntimeException ("Token#toString(): NONGREEDYCLOSURE " 1247 +this.getMin()+", "+this.getMax()); 1248 } 1249 return ret; 1250 } 1251 } 1252 1253 1256 static class ParenToken extends Token implements java.io.Serializable { 1257 Token child; 1258 int parennumber; 1259 1260 ParenToken(int type, Token tok, int paren) { 1261 super(type); 1262 this.child = tok; 1263 this.parennumber = paren; 1264 } 1265 1266 int size() { 1267 return 1; 1268 } 1269 Token getChild(int index) { 1270 return this.child; 1271 } 1272 1273 int getParenNumber() { 1274 return this.parennumber; 1275 } 1276 1277 public String toString(int options) { 1278 String ret = null; 1279 switch (this.type) { 1280 case PAREN: 1281 if (this.parennumber == 0) { 1282 ret = "(?:"+this.child.toString(options)+")"; 1283 } else { 1284 ret = "("+this.child.toString(options)+")"; 1285 } 1286 break; 1287 1288 case LOOKAHEAD: 1289 ret = "(?="+this.child.toString(options)+")"; 1290 break; 1291 case NEGATIVELOOKAHEAD: 1292 ret = "(?!"+this.child.toString(options)+")"; 1293 break; 1294 case LOOKBEHIND: 1295 ret = "(?<="+this.child.toString(options)+")"; 1296 break; 1297 case NEGATIVELOOKBEHIND: 1298 ret = "(?<!"+this.child.toString(options)+")"; 1299 break; 1300 case INDEPENDENT: 1301 ret = "(?>"+this.child.toString(options)+")"; 1302 break; 1303 } 1304 return ret; 1305 } 1306 } 1307 1308 1311 static class ConditionToken extends Token implements java.io.Serializable { 1312 int refNumber; 1313 Token condition; 1314 Token yes; 1315 Token no; 1316 ConditionToken(int refno, Token cond, Token yespat, Token nopat) { 1317 super(Token.CONDITION); 1318 this.refNumber = refno; 1319 this.condition = cond; 1320 this.yes = yespat; 1321 this.no = nopat; 1322 } 1323 int size() { 1324 return this.no == null ? 1 : 2; 1325 } 1326 Token getChild(int index) { 1327 if (index == 0) return this.yes; 1328 if (index == 1) return this.no; 1329 throw new RuntimeException ("Internal Error: "+index); 1330 } 1331 1332 public String toString(int options) { 1333 String ret; 1334 if (refNumber > 0) { 1335 ret = "(?("+refNumber+")"; 1336 } else if (this.condition.type == Token.ANCHOR) { 1337 ret = "(?("+this.condition+")"; 1338 } else { 1339 ret = "(?"+this.condition; 1340 } 1341 1342 if (this.no == null) { 1343 ret += this.yes+")"; 1344 } else { 1345 ret += this.yes+"|"+this.no+")"; 1346 } 1347 return ret; 1348 } 1349 } 1350 1351 1354 static class ModifierToken extends Token implements java.io.Serializable { 1355 Token child; 1356 int add; 1357 int mask; 1358 1359 ModifierToken(Token tok, int add, int mask) { 1360 super(Token.MODIFIERGROUP); 1361 this.child = tok; 1362 this.add = add; 1363 this.mask = mask; 1364 } 1365 1366 int size() { 1367 return 1; 1368 } 1369 Token getChild(int index) { 1370 return this.child; 1371 } 1372 1373 int getOptions() { 1374 return this.add; 1375 } 1376 int getOptionsMask() { 1377 return this.mask; 1378 } 1379 1380 public String toString(int options) { 1381 return "(?" 1382 +(this.add == 0 ? "" : REUtil.createOptionString(this.add)) 1383 +(this.mask == 0 ? "" : REUtil.createOptionString(this.mask)) 1384 +":" 1385 +this.child.toString(options) 1386 +")"; 1387 } 1388 } 1389 1390 1394 static class UnionToken extends Token implements java.io.Serializable { 1395 Vector children; 1396 1397 UnionToken(int type) { 1398 super(type); 1399 } 1400 1401 void addChild(Token tok) { 1402 if (tok == null) return; 1403 if (this.children == null) this.children = new Vector (); 1404 if (this.type == UNION) { 1405 this.children.addElement(tok); 1406 return; 1407 } 1408 if (tok.type == CONCAT) { 1410 for (int i = 0; i < tok.size(); i ++) 1411 this.addChild(tok.getChild(i)); return; 1413 } 1414 int size = this.children.size(); 1415 if (size == 0) { 1416 this.children.addElement(tok); 1417 return; 1418 } 1419 Token previous = (Token)this.children.elementAt(size-1); 1420 if (!((previous.type == CHAR || previous.type == STRING) 1421 && (tok.type == CHAR || tok.type == STRING))) { 1422 this.children.addElement(tok); 1423 return; 1424 } 1425 1426 1428 StringBuffer buffer; 1429 int nextMaxLength = (tok.type == CHAR ? 2 : tok.getString().length()); 1430 if (previous.type == CHAR) { buffer = new StringBuffer (2 + nextMaxLength); 1432 int ch = previous.getChar(); 1433 if (ch >= 0x10000) 1434 buffer.append(REUtil.decomposeToSurrogates(ch)); 1435 else 1436 buffer.append((char)ch); 1437 previous = Token.createString(null); 1438 this.children.setElementAt(previous, size-1); 1439 } else { buffer = new StringBuffer (previous.getString().length() + nextMaxLength); 1441 buffer.append(previous.getString()); 1442 } 1443 1444 if (tok.type == CHAR) { 1445 int ch = tok.getChar(); 1446 if (ch >= 0x10000) 1447 buffer.append(REUtil.decomposeToSurrogates(ch)); 1448 else 1449 buffer.append((char)ch); 1450 } else { 1451 buffer.append(tok.getString()); 1452 } 1453 1454 ((StringToken)previous).string = new String (buffer); 1455 } 1456 1457 int size() { 1458 return this.children == null ? 0 : this.children.size(); 1459 } 1460 Token getChild(int index) { 1461 return (Token)this.children.elementAt(index); 1462 } 1463 1464 public String toString(int options) { 1465 String ret; 1466 if (this.type == CONCAT) { 1467 if (this.children.size() == 2) { 1468 Token ch = this.getChild(0); 1469 Token ch2 = this.getChild(1); 1470 if (ch2.type == CLOSURE && ch2.getChild(0) == ch) { 1471 ret = ch.toString(options)+"+"; 1472 } else if (ch2.type == NONGREEDYCLOSURE && ch2.getChild(0) == ch) { 1473 ret = ch.toString(options)+"+?"; 1474 } else 1475 ret = ch.toString(options)+ch2.toString(options); 1476 } else { 1477 StringBuffer sb = new StringBuffer (); 1478 for (int i = 0; i < this.children.size(); i ++) { 1479 sb.append(((Token)this.children.elementAt(i)).toString(options)); 1480 } 1481 ret = new String (sb); 1482 } 1483 return ret; 1484 } 1485 if (this.children.size() == 2 && this.getChild(1).type == EMPTY) { 1486 ret = this.getChild(0).toString(options)+"?"; 1487 } else if (this.children.size() == 2 1488 && this.getChild(0).type == EMPTY) { 1489 ret = this.getChild(1).toString(options)+"??"; 1490 } else { 1491 StringBuffer sb = new StringBuffer (); 1492 sb.append(((Token)this.children.elementAt(0)).toString(options)); 1493 for (int i = 1; i < this.children.size(); i ++) { 1494 sb.append((char)'|'); 1495 sb.append(((Token)this.children.elementAt(i)).toString(options)); 1496 } 1497 ret = new String (sb); 1498 } 1499 return ret; 1500 } 1501 } 1502} 1503 | Popular Tags |