| 1 16 17 package org.apache.xerces.impl.xpath.regex; 18 19 import java.util.Vector ; 20 import java.util.Hashtable ; 21 22 29 class Token implements java.io.Serializable { 30 31 private static final long serialVersionUID = 4049923761862293040L; 32 33 static final boolean COUNTTOKENS = true; 34 static int tokens = 0; 35 36 static final int CHAR = 0; static final int DOT = 11; static final int CONCAT = 1; static final int UNION = 2; static final int CLOSURE = 3; static final int RANGE = 4; static final int NRANGE = 5; static final int PAREN = 6; static final int EMPTY = 7; static final int ANCHOR = 8; static final int NONGREEDYCLOSURE = 9; static final int STRING = 10; static final int BACKREFERENCE = 12; static final int LOOKAHEAD = 20; static final int NEGATIVELOOKAHEAD = 21; static final int LOOKBEHIND = 22; static final int NEGATIVELOOKBEHIND = 23; static final int INDEPENDENT = 24; static final int MODIFIERGROUP = 25; static final int CONDITION = 26; 57 static final int UTF16_MAX = 0x10ffff; 58 59 int type; 60 61 static Token token_dot; 62 static Token token_0to9; 63 static Token token_wordchars; 64 static Token token_not_0to9; 65 static Token token_not_wordchars; 66 static Token token_spaces; 67 static Token token_not_spaces; 68 static Token token_empty; 69 static Token token_linebeginning; 70 static Token token_linebeginning2; 71 static Token token_lineend; 72 static Token token_stringbeginning; 73 static Token token_stringend; 74 static Token token_stringend2; 75 static Token token_wordedge; 76 static Token token_not_wordedge; 77 static Token token_wordbeginning; 78 static Token token_wordend; 79 static { 80 Token.token_empty = new Token(Token.EMPTY); 81 82 Token.token_linebeginning = Token.createAnchor('^'); 83 Token.token_linebeginning2 = Token.createAnchor('@'); 84 Token.token_lineend = Token.createAnchor('$'); 85 Token.token_stringbeginning = Token.createAnchor('A'); 86 Token.token_stringend = Token.createAnchor('z'); 87 Token.token_stringend2 = Token.createAnchor('Z'); 88 Token.token_wordedge = Token.createAnchor('b'); 89 Token.token_not_wordedge = Token.createAnchor('B'); 90 Token.token_wordbeginning = Token.createAnchor('<'); 91 Token.token_wordend = Token.createAnchor('>'); 92 93 Token.token_dot = new Token(Token.DOT); 94 95 Token.token_0to9 = Token.createRange(); 96 Token.token_0to9.addRange('0', '9'); 97 Token.token_wordchars = Token.createRange(); 98 Token.token_wordchars.addRange('0', '9'); 99 Token.token_wordchars.addRange('A', 'Z'); 100 Token.token_wordchars.addRange('_', '_'); 101 Token.token_wordchars.addRange('a', 'z'); 102 Token.token_spaces = Token.createRange(); 103 Token.token_spaces.addRange('\t', '\t'); 104 Token.token_spaces.addRange('\n', '\n'); 105 Token.token_spaces.addRange('\f', '\f'); 106 Token.token_spaces.addRange('\r', '\r'); 107 Token.token_spaces.addRange(' ', ' '); 108 109 Token.token_not_0to9 = Token.complementRanges(Token.token_0to9); 110 Token.token_not_wordchars = Token.complementRanges(Token.token_wordchars); 111 Token.token_not_spaces = Token.complementRanges(Token.token_spaces); 112 } 113 114 static Token.ParenToken createLook(int type, Token child) { 115 if (COUNTTOKENS) Token.tokens ++; 116 return new Token.ParenToken(type, child, 0); 117 } 118 static Token.ParenToken createParen(Token child, int pnumber) { 119 if (COUNTTOKENS) Token.tokens ++; 120 return new Token.ParenToken(Token.PAREN, child, pnumber); 121 } 122 static Token.ClosureToken createClosure(Token tok) { 123 if (COUNTTOKENS) Token.tokens ++; 124 return new Token.ClosureToken(Token.CLOSURE, tok); 125 } 126 static Token.ClosureToken createNGClosure(Token tok) { 127 if (COUNTTOKENS) Token.tokens ++; 128 return new Token.ClosureToken(Token.NONGREEDYCLOSURE, tok); 129 } 130 static Token.ConcatToken createConcat(Token tok1, Token tok2) { 131 if (COUNTTOKENS) Token.tokens ++; 132 return new Token.ConcatToken(tok1, tok2); 133 } 134 static Token.UnionToken createConcat() { 135 if (COUNTTOKENS) Token.tokens ++; 136 return new Token.UnionToken(Token.CONCAT); } 138 static Token.UnionToken createUnion() { 139 if (COUNTTOKENS) Token.tokens ++; 140 return new Token.UnionToken(Token.UNION); 141 } 142 static Token createEmpty() { 143 return Token.token_empty; 144 } 145 static RangeToken createRange() { 146 if (COUNTTOKENS) Token.tokens ++; 147 return new RangeToken(Token.RANGE); 148 } 149 static RangeToken createNRange() { 150 if (COUNTTOKENS) Token.tokens ++; 151 return new RangeToken(Token.NRANGE); 152 } 153 static Token.CharToken createChar(int ch) { 154 if (COUNTTOKENS) Token.tokens ++; 155 return new Token.CharToken(Token.CHAR, ch); 156 } 157 static private Token.CharToken createAnchor(int ch) { 158 if (COUNTTOKENS) Token.tokens ++; 159 return new Token.CharToken(Token.ANCHOR, ch); 160 } 161 static Token.StringToken createBackReference(int refno) { 162 if (COUNTTOKENS) Token.tokens ++; 163 return new Token.StringToken(Token.BACKREFERENCE, null, refno); 164 } 165 static Token.StringToken createString(String str) { 166 if (COUNTTOKENS) Token.tokens ++; 167 return new Token.StringToken(Token.STRING, str, 0); 168 } 169 static Token.ModifierToken createModifierGroup(Token child, int add, int mask) { 170 if (COUNTTOKENS) Token.tokens ++; 171 return new Token.ModifierToken(child, add, mask); 172 } 173 static Token.ConditionToken createCondition(int refno, Token condition, 174 Token yespat, Token nopat) { 175 if (COUNTTOKENS) Token.tokens ++; 176 return new Token.ConditionToken(refno, condition, yespat, nopat); 177 } 178 179 protected Token(int type) { 180 this.type = type; 181 } 182 183 186 int size() { 187 return 0; 188 } 189 Token getChild(int index) { 190 return null; 191 } 192 void addChild(Token tok) { 193 throw new RuntimeException ("Not supported."); 194 } 195 196 protected void addRange(int start, int end) { 198 throw new RuntimeException ("Not supported."); 199 } 200 protected void sortRanges() { 201 throw new RuntimeException ("Not supported."); 202 } 203 protected void compactRanges() { 204 throw new RuntimeException ("Not supported."); 205 } 206 protected void mergeRanges(Token tok) { 207 throw new RuntimeException ("Not supported."); 208 } 209 protected void subtractRanges(Token tok) { 210 throw new RuntimeException ("Not supported."); 211 } 212 protected void intersectRanges(Token tok) { 213 throw new RuntimeException ("Not supported."); 214 } 215 static Token complementRanges(Token tok) { 216 return RangeToken.complementRanges(tok); 217 } 218 219 220 void setMin(int min) { } 222 void setMax(int max) { } 224 int getMin() { return -1; 226 } 227 int getMax() { return -1; 229 } 230 int getReferenceNumber() { return 0; 232 } 233 String getString() { return null; 235 } 236 237 int getParenNumber() { 238 return 0; 239 } 240 int getChar() { 241 return -1; 242 } 243 244 public String toString() { 245 return this.toString(0); 246 } 247 public String toString(int options) { 248 return this.type == Token.DOT ? "." : ""; 249 } 250 251 254 final int getMinLength() { 255 switch (this.type) { 256 case CONCAT: 257 int sum = 0; 258 for (int i = 0; i < this.size(); i ++) 259 sum += this.getChild(i).getMinLength(); 260 return sum; 261 262 case CONDITION: 263 case UNION: 264 if (this.size() == 0) 265 return 0; 266 int ret = this.getChild(0).getMinLength(); 267 for (int i = 1; i < this.size(); i ++) { 268 int min = this.getChild(i).getMinLength(); 269 if (min < ret) ret = min; 270 } 271 return ret; 272 273 case CLOSURE: 274 case NONGREEDYCLOSURE: 275 if (this.getMin() >= 0) 276 return this.getMin() * this.getChild(0).getMinLength(); 277 return 0; 278 279 case EMPTY: 280 case ANCHOR: 281 return 0; 282 283 case DOT: 284 case CHAR: 285 case RANGE: 286 case NRANGE: 287 return 1; 288 289 case INDEPENDENT: 290 case PAREN: 291 case MODIFIERGROUP: 292 return this.getChild(0).getMinLength(); 293 294 case BACKREFERENCE: 295 return 0; 297 case STRING: 298 return this.getString().length(); 299 300 case LOOKAHEAD: 301 case NEGATIVELOOKAHEAD: 302 case LOOKBEHIND: 303 case NEGATIVELOOKBEHIND: 304 return 0; 306 default: 307 throw new RuntimeException ("Token#getMinLength(): Invalid Type: "+this.type); 308 } 309 } 310 311 final int getMaxLength() { 312 switch (this.type) { 313 case CONCAT: 314 int sum = 0; 315 for (int i = 0; i < this.size(); i ++) { 316 int d = this.getChild(i).getMaxLength(); 317 if (d < 0) return -1; 318 sum += d; 319 } 320 return sum; 321 322 case CONDITION: 323 case UNION: 324 if (this.size() == 0) 325 return 0; 326 int ret = this.getChild(0).getMaxLength(); 327 for (int i = 1; ret >= 0 && i < this.size(); i ++) { 328 int max = this.getChild(i).getMaxLength(); 329 if (max < 0) { ret = -1; 331 break; 332 } 333 if (max > ret) ret = max; 334 } 335 return ret; 336 337 case CLOSURE: 338 case NONGREEDYCLOSURE: 339 if (this.getMax() >= 0) 340 return this.getMax() * this.getChild(0).getMaxLength(); 343 return -1; 344 345 case EMPTY: 346 case ANCHOR: 347 return 0; 348 349 case CHAR: 350 return 1; 351 case DOT: 352 case RANGE: 353 case NRANGE: 354 return 2; 355 356 case INDEPENDENT: 357 case PAREN: 358 case MODIFIERGROUP: 359 return this.getChild(0).getMaxLength(); 360 361 case BACKREFERENCE: 362 return -1; 364 case STRING: 365 return this.getString().length(); 366 367 case LOOKAHEAD: 368 case NEGATIVELOOKAHEAD: 369 case LOOKBEHIND: 370 case NEGATIVELOOKBEHIND: 371 return 0; 373 default: 374 throw new RuntimeException ("Token#getMaxLength(): Invalid Type: "+this.type); 375 } 376 } 377 378 static final int FC_CONTINUE = 0; 379 static final int FC_TERMINAL = 1; 380 static final int FC_ANY = 2; 381 private static final boolean isSet(int options, int flag) { 382 return (options & flag) == flag; 383 } 384 final int analyzeFirstCharacter(RangeToken result, int options) { 385 switch (this.type) { 386 case CONCAT: 387 int ret = FC_CONTINUE; 388 for (int i = 0; i < this.size(); i ++) 389 if ((ret = this.getChild(i).analyzeFirstCharacter(result, options)) != FC_CONTINUE) 390 break; 391 return ret; 392 393 case UNION: 394 if (this.size() == 0) 395 return FC_CONTINUE; 396 401 int ret2 = FC_CONTINUE; 402 boolean hasEmpty = false; 403 for (int i = 0; i < this.size(); i ++) { 404 ret2 = this.getChild(i).analyzeFirstCharacter(result, options); 405 if (ret2 == FC_ANY) 406 break; 407 else if (ret2 == FC_CONTINUE) 408 hasEmpty = true; 409 } 410 return hasEmpty ? FC_CONTINUE : ret2; 411 412 case CONDITION: 413 int ret3 = this.getChild(0).analyzeFirstCharacter(result, options); 414 if (this.size() == 1) return FC_CONTINUE; 415 if (ret3 == FC_ANY) return ret3; 416 int ret4 = this.getChild(1).analyzeFirstCharacter(result, options); 417 if (ret4 == FC_ANY) return ret4; 418 return ret3 == FC_CONTINUE || ret4 == FC_CONTINUE ? FC_CONTINUE : FC_TERMINAL; 419 420 case CLOSURE: 421 case NONGREEDYCLOSURE: 422 this.getChild(0).analyzeFirstCharacter(result, options); 423 return FC_CONTINUE; 424 425 case EMPTY: 426 case ANCHOR: 427 return FC_CONTINUE; 428 429 case CHAR: 430 int ch = this.getChar(); 431 result.addRange(ch, ch); 432 if (ch < 0x10000 && isSet(options, RegularExpression.IGNORE_CASE)) { 433 ch = Character.toUpperCase((char)ch); 434 result.addRange(ch, ch); 435 ch = Character.toLowerCase((char)ch); 436 result.addRange(ch, ch); 437 } 438 return FC_TERMINAL; 439 440 case DOT: if (isSet(options, RegularExpression.SINGLE_LINE)) { 442 return FC_CONTINUE; } else { 444 return FC_CONTINUE; 445 453 } 454 455 case RANGE: 456 if (isSet(options, RegularExpression.IGNORE_CASE)) { 457 result.mergeRanges(((RangeToken)this).getCaseInsensitiveToken()); 458 } else { 459 result.mergeRanges(this); 460 } 461 return FC_TERMINAL; 462 463 case NRANGE: if (isSet(options, RegularExpression.IGNORE_CASE)) { 465 result.mergeRanges(Token.complementRanges(((RangeToken)this).getCaseInsensitiveToken())); 466 } else { 467 result.mergeRanges(Token.complementRanges(this)); 468 } 469 return FC_TERMINAL; 470 471 case INDEPENDENT: 472 case PAREN: 473 return this.getChild(0).analyzeFirstCharacter(result, options); 474 475 case MODIFIERGROUP: 476 options |= ((ModifierToken)this).getOptions(); 477 options &= ~((ModifierToken)this).getOptionsMask(); 478 return this.getChild(0).analyzeFirstCharacter(result, options); 479 480 case BACKREFERENCE: 481 result.addRange(0, UTF16_MAX); return FC_ANY; 483 484 case STRING: 485 int cha = this.getString().charAt(0); 486 int ch2; 487 if (REUtil.isHighSurrogate(cha) 488 && this.getString().length() >= 2 489 && REUtil.isLowSurrogate((ch2 = this.getString().charAt(1)))) 490 cha = REUtil.composeFromSurrogates(cha, ch2); 491 result.addRange(cha, cha); 492 if (cha < 0x10000 && isSet(options, RegularExpression.IGNORE_CASE)) { 493 cha = Character.toUpperCase((char)cha); 494 result.addRange(cha, cha); 495 cha = Character.toLowerCase((char)cha); 496 result.addRange(cha, cha); 497 } 498 return FC_TERMINAL; 499 500 case LOOKAHEAD: 501 case NEGATIVELOOKAHEAD: 502 case LOOKBEHIND: 503 case NEGATIVELOOKBEHIND: 504 return FC_CONTINUE; 505 506 default: 507 throw new RuntimeException ("Token#analyzeHeadCharacter(): Invalid Type: "+this.type); 508 } 509 } 510 511 private final boolean isShorterThan(Token tok) { 512 if (tok == null) return false; 513 523 int mylength; 524 if (this.type == STRING) mylength = this.getString().length(); 525 else throw new RuntimeException ("Internal Error: Illegal type: "+this.type); 526 int otherlength; 527 if (tok.type == STRING) otherlength = tok.getString().length(); 528 else throw new RuntimeException ("Internal Error: Illegal type: "+tok.type); 529 return mylength < otherlength; 530 } 531 532 static class FixedStringContainer { 533 Token token = null; 534 int options = 0; 535 FixedStringContainer() { 536 } 537 } 538 539 final void findFixedString(FixedStringContainer container, int options) { 540 switch (this.type) { 541 case CONCAT: 542 Token prevToken = null; 543 int prevOptions = 0; 544 for (int i = 0; i < this.size(); i ++) { 545 this.getChild(i).findFixedString(container, options); 546 if (prevToken == null || prevToken.isShorterThan(container.token)) { 547 prevToken = container.token; 548 prevOptions = container.options; 549 } 550 } 551 container.token = prevToken; 552 container.options = prevOptions; 553 return; 554 555 case UNION: 556 case CLOSURE: 557 case NONGREEDYCLOSURE: 558 case EMPTY: 559 case ANCHOR: 560 case RANGE: 561 case DOT: 562 case NRANGE: 563 case BACKREFERENCE: 564 case LOOKAHEAD: 565 case NEGATIVELOOKAHEAD: 566 case LOOKBEHIND: 567 case NEGATIVELOOKBEHIND: 568 case CONDITION: 569 container.token = null; 570 return; 571 572 case CHAR: container.token = null; return; 576 case STRING: 577 container.token = this; 578 container.options = options; 579 return; 580 581 case INDEPENDENT: 582 case PAREN: 583 this.getChild(0).findFixedString(container, options); 584 return; 585 586 case MODIFIERGROUP: 587 options |= ((ModifierToken)this).getOptions(); 588 options &= ~((ModifierToken)this).getOptionsMask(); 589 this.getChild(0).findFixedString(container, options); 590 return; 591 592 default: 593 throw new RuntimeException ("Token#findFixedString(): Invalid Type: "+this.type); 594 } 595 } 596 597 boolean match(int ch) { 598 throw new RuntimeException ("NFAArrow#match(): Internal error: "+this.type); 599 } 600 601 private final static Hashtable categories = new Hashtable (); 603 private final static Hashtable categories2 = new Hashtable (); 604 private static final String [] categoryNames = { 605 "Cn", "Lu", "Ll", "Lt", "Lm", "Lo", "Mn", "Me", "Mc", "Nd", 606 "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", null, "Co", "Cs", 607 "Pd", "Ps", "Pe", "Pc", "Po", "Sm", "Sc", "Sk", "So", "Pi", "Pf", "L", "M", "N", "Z", "C", "P", "S", }; 611 612 static final int CHAR_INIT_QUOTE = 29; static final int CHAR_FINAL_QUOTE = 30; static final int CHAR_LETTER = 31; 616 static final int CHAR_MARK = 32; 617 static final int CHAR_NUMBER = 33; 618 static final int CHAR_SEPARATOR = 34; 619 static final int CHAR_OTHER = 35; 620 static final int CHAR_PUNCTUATION = 36; 621 static final int CHAR_SYMBOL = 37; 622 623 private static final String [] blockNames = { 625 "Basic Latin", 626 "Latin-1 Supplement", 627 "Latin Extended-A", 628 "Latin Extended-B", 629 "IPA Extensions", 630 "Spacing Modifier Letters", 631 "Combining Diacritical Marks", 632 "Greek", 633 "Cyrillic", 634 "Armenian", 635 "Hebrew", 636 "Arabic", 637 "Syriac", 638 "Thaana", 639 "Devanagari", 640 "Bengali", 641 "Gurmukhi", 642 "Gujarati", 643 "Oriya", 644 "Tamil", 645 "Telugu", 646 "Kannada", 647 "Malayalam", 648 "Sinhala", 649 "Thai", 650 "Lao", 651 "Tibetan", 652 "Myanmar", 653 "Georgian", 654 "Hangul Jamo", 655 "Ethiopic", 656 "Cherokee", 657 "Unified Canadian Aboriginal Syllabics", 658 "Ogham", 659 "Runic", 660 "Khmer", 661 "Mongolian", 662 "Latin Extended Additional", 663 "Greek Extended", 664 "General Punctuation", 665 "Superscripts and Subscripts", 666 "Currency Symbols", 667 "Combining Marks for Symbols", 668 "Letterlike Symbols", 669 "Number Forms", 670 "Arrows", 671 "Mathematical Operators", 672 "Miscellaneous Technical", 673 "Control Pictures", 674 "Optical Character Recognition", 675 "Enclosed Alphanumerics", 676 "Box Drawing", 677 "Block Elements", 678 "Geometric Shapes", 679 "Miscellaneous Symbols", 680 "Dingbats", 681 "Braille Patterns", 682 "CJK Radicals Supplement", 683 "Kangxi Radicals", 684 "Ideographic Description Characters", 685 "CJK Symbols and Punctuation", 686 "Hiragana", 687 "Katakana", 688 "Bopomofo", 689 "Hangul Compatibility Jamo", 690 "Kanbun", 691 "Bopomofo Extended", 692 "Enclosed CJK Letters and Months", 693 "CJK Compatibility", 694 "CJK Unified Ideographs Extension A", 695 "CJK Unified Ideographs", 696 "Yi Syllables", 697 "Yi Radicals", 698 "Hangul Syllables", 699 "Private Use", 700 "CJK Compatibility Ideographs", 701 "Alphabetic Presentation Forms", 702 "Arabic Presentation Forms-A", 703 "Combining Half Marks", 704 "CJK Compatibility Forms", 705 "Small Form Variants", 706 "Arabic Presentation Forms-B", 707 "Specials", 708 "Halfwidth and Fullwidth Forms", 709 "Old Italic", "Gothic", 712 "Deseret", 713 "Byzantine Musical Symbols", 714 "Musical Symbols", 715 "Mathematical Alphanumeric Symbols", 716 "CJK Unified Ideographs Extension B", 717 "CJK Compatibility Ideographs Supplement", 718 "Tags", 719 721 }; 722 static final String blockRanges = 727 "\u0000\u007F\u0080\u00FF\u0100\u017F\u0180\u024F\u0250\u02AF\u02B0\u02FF\u0300\u036F" 728 +"\u0370\u03FF\u0400\u04FF\u0530\u058F\u0590\u05FF\u0600\u06FF\u0700\u074F\u0780\u07BF" 729 <
|