1 57 58 package com.sun.org.apache.xerces.internal.impl.xpath.regex; 59 60 import java.util.Locale ; 61 import java.util.MissingResourceException ; 62 import java.util.ResourceBundle ; 63 import java.util.Vector ; 64 65 70 class RegexParser { 71 static final int T_CHAR = 0; 72 static final int T_EOF = 1; 73 static final int T_OR = 2; static final int T_STAR = 3; static final int T_PLUS = 4; static final int T_QUESTION = 5; static final int T_LPAREN = 6; static final int T_RPAREN = 7; static final int T_DOT = 8; static final int T_LBRACKET = 9; static final int T_BACKSOLIDUS = 10; static final int T_CARET = 11; static final int T_DOLLAR = 12; static final int T_LPAREN2 = 13; static final int T_LOOKAHEAD = 14; static final int T_NEGATIVELOOKAHEAD = 15; static final int T_LOOKBEHIND = 16; static final int T_NEGATIVELOOKBEHIND = 17; static final int T_INDEPENDENT = 18; static final int T_SET_OPERATIONS = 19; static final int T_POSIX_CHARCLASS_START = 20; static final int T_COMMENT = 21; static final int T_MODIFIERS = 22; static final int T_CONDITION = 23; static final int T_XMLSCHEMA_CC_SUBTRACTION = 24; 97 static class ReferencePosition { 98 int refNumber; 99 int position; 100 ReferencePosition(int n, int pos) { 101 this.refNumber = n; 102 this.position = pos; 103 } 104 } 105 106 int offset; 107 String regex; 108 int regexlen; 109 int options; 110 ResourceBundle resources; 111 int chardata; 112 int nexttoken; 113 static protected final int S_NORMAL = 0; 114 static protected final int S_INBRACKETS = 1; 115 static protected final int S_INXBRACKETS = 2; 116 int context = S_NORMAL; 117 int parennumber = 1; 118 boolean hasBackReferences; 119 Vector references = null; 120 121 public RegexParser() { 122 this.setLocale(Locale.getDefault()); 123 } 124 public RegexParser(Locale locale) { 125 this.setLocale(locale); 126 } 127 128 public void setLocale(Locale locale) { 129 try { 130 this.resources = ResourceBundle.getBundle("com.sun.org.apache.xerces.internal.impl.xpath.regex.message", locale); 131 } catch (MissingResourceException mre) { 132 throw new RuntimeException ("Installation Problem??? Couldn't load messages: " 133 +mre.getMessage()); 134 } 135 } 136 137 final ParseException ex(String key, int loc) { 138 return new ParseException(this.resources.getString(key), loc); 139 } 140 141 private final boolean isSet(int flag) { 142 return (this.options & flag) == flag; 143 } 144 145 synchronized Token parse(String regex, int options) throws ParseException { 146 this.options = options; 147 this.offset = 0; 148 this.setContext(S_NORMAL); 149 this.parennumber = 1; 150 this.hasBackReferences = false; 151 this.regex = regex; 152 if (this.isSet(RegularExpression.EXTENDED_COMMENT)) 153 this.regex = REUtil.stripExtendedComment(this.regex); 154 this.regexlen = this.regex.length(); 155 156 157 this.next(); 158 Token ret = this.parseRegex(); 159 if (this.offset != this.regexlen) 160 throw ex("parser.parse.1", this.offset); 161 if (this.references != null) { 162 for (int i = 0; i < this.references.size(); i ++) { 163 ReferencePosition position = (ReferencePosition)this.references.elementAt(i); 164 if (this.parennumber <= position.refNumber) 165 throw ex("parser.parse.2", position.position); 166 } 167 this.references.removeAllElements(); 168 } 169 return ret; 170 } 171 172 178 179 protected final void setContext(int con) { 180 this.context = con; 181 } 182 183 final int read() { 184 return this.nexttoken; 185 } 186 187 final void next() { 188 if (this.offset >= this.regexlen) { 189 this.chardata = -1; 190 this.nexttoken = T_EOF; 191 return; 192 } 193 194 int ret; 195 int ch = this.regex.charAt(this.offset++); 196 this.chardata = ch; 197 198 if (this.context == S_INBRACKETS) { 199 switch (ch) { 202 case '\\': 203 ret = T_BACKSOLIDUS; 204 if (this.offset >= this.regexlen) 205 throw ex("parser.next.1", this.offset-1); 206 this.chardata = this.regex.charAt(this.offset++); 207 break; 208 209 case '-': 210 if (this.isSet(RegularExpression.XMLSCHEMA_MODE) 211 && this.offset < this.regexlen && this.regex.charAt(this.offset) == '[') { 212 this.offset++; 213 ret = T_XMLSCHEMA_CC_SUBTRACTION; 214 } else 215 ret = T_CHAR; 216 break; 217 218 case '[': 219 if (!this.isSet(RegularExpression.XMLSCHEMA_MODE) 220 && this.offset < this.regexlen && this.regex.charAt(this.offset) == ':') { 221 this.offset++; 222 ret = T_POSIX_CHARCLASS_START; 223 break; 224 } default: 226 if (REUtil.isHighSurrogate(ch) && this.offset < this.regexlen) { 227 int low = this.regex.charAt(this.offset); 228 if (REUtil.isLowSurrogate(low)) { 229 this.chardata = REUtil.composeFromSurrogates(ch, low); 230 this.offset ++; 231 } 232 } 233 ret = T_CHAR; 234 } 235 this.nexttoken = ret; 236 return; 237 } 238 239 switch (ch) { 240 case '|': ret = T_OR; break; 241 case '*': ret = T_STAR; break; 242 case '+': ret = T_PLUS; break; 243 case '?': ret = T_QUESTION; break; 244 case ')': ret = T_RPAREN; break; 245 case '.': ret = T_DOT; break; 246 case '[': ret = T_LBRACKET; break; 247 case '^': ret = T_CARET; break; 248 case '$': ret = T_DOLLAR; break; 249 case '(': 250 ret = T_LPAREN; 251 if (this.offset >= this.regexlen) 252 break; 253 if (this.regex.charAt(this.offset) != '?') 254 break; 255 if (++this.offset >= this.regexlen) 256 throw ex("parser.next.2", this.offset-1); 257 ch = this.regex.charAt(this.offset++); 258 switch (ch) { 259 case ':': ret = T_LPAREN2; break; 260 case '=': ret = T_LOOKAHEAD; break; 261 case '!': ret = T_NEGATIVELOOKAHEAD; break; 262 case '[': ret = T_SET_OPERATIONS; break; 263 case '>': ret = T_INDEPENDENT; break; 264 case '<': 265 if (this.offset >= this.regexlen) 266 throw ex("parser.next.2", this.offset-3); 267 ch = this.regex.charAt(this.offset++); 268 if (ch == '=') { 269 ret = T_LOOKBEHIND; 270 } else if (ch == '!') { 271 ret = T_NEGATIVELOOKBEHIND; 272 } else 273 throw ex("parser.next.3", this.offset-3); 274 break; 275 case '#': 276 while (this.offset < this.regexlen) { 277 ch = this.regex.charAt(this.offset++); 278 if (ch == ')') break; 279 } 280 if (ch != ')') 281 throw ex("parser.next.4", this.offset-1); 282 ret = T_COMMENT; 283 break; 284 default: 285 if (ch == '-' || 'a' <= ch && ch <= 'z' || 'A' <= ch && ch <= 'Z') { this.offset --; 287 ret = T_MODIFIERS; 288 break; 289 } else if (ch == '(') { ret = T_CONDITION; break; 292 } 293 throw ex("parser.next.2", this.offset-2); 294 } 295 break; 296 297 case '\\': 298 ret = T_BACKSOLIDUS; 299 if (this.offset >= this.regexlen) 300 throw ex("parser.next.1", this.offset-1); 301 this.chardata = this.regex.charAt(this.offset++); 302 break; 303 304 default: 305 ret = T_CHAR; 306 } 307 this.nexttoken = ret; 308 } 309 310 319 Token parseRegex() throws ParseException { 320 Token tok = this.parseTerm(); 321 Token parent = null; 322 while (this.read() == T_OR) { 323 this.next(); if (parent == null) { 325 parent = Token.createUnion(); 326 parent.addChild(tok); 327 tok = parent; 328 } 329 tok.addChild(this.parseTerm()); 330 } 331 return tok; 332 } 333 334 337 Token parseTerm() throws ParseException { 338 int ch = this.read(); 339 if (ch == T_OR || ch == T_RPAREN || ch == T_EOF) { 340 return Token.createEmpty(); 341 } else { 342 Token tok = this.parseFactor(); 343 Token concat = null; 344 while ((ch = this.read()) != T_OR && ch != T_RPAREN && ch != T_EOF) { 345 if (concat == null) { 346 concat = Token.createConcat(); 347 concat.addChild(tok); 348 tok = concat; 349 } 350 concat.addChild(this.parseFactor()); 351 } 353 return tok; 354 } 355 } 356 357 359 Token processCaret() throws ParseException { 360 this.next(); 361 return Token.token_linebeginning; 362 } 363 Token processDollar() throws ParseException { 364 this.next(); 365 return Token.token_lineend; 366 } 367 Token processLookahead() throws ParseException { 368 this.next(); 369 Token tok = Token.createLook(Token.LOOKAHEAD, this.parseRegex()); 370 if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1); 371 this.next(); return tok; 373 } 374 Token processNegativelookahead() throws ParseException { 375 this.next(); 376 Token tok = Token.createLook(Token.NEGATIVELOOKAHEAD, this.parseRegex()); 377 if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1); 378 this.next(); return tok; 380 } 381 Token processLookbehind() throws ParseException { 382 this.next(); 383 Token tok = Token.createLook(Token.LOOKBEHIND, this.parseRegex()); 384 if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1); 385 this.next(); return tok; 387 } 388 Token processNegativelookbehind() throws ParseException { 389 this.next(); 390 Token tok = Token.createLook(Token.NEGATIVELOOKBEHIND, this.parseRegex()); 391 if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1); 392 this.next(); return tok; 394 } 395 Token processBacksolidus_A() throws ParseException { 396 this.next(); 397 return Token.token_stringbeginning; 398 } 399 Token processBacksolidus_Z() throws ParseException { 400 this.next(); 401 return Token.token_stringend2; 402 } 403 Token processBacksolidus_z() throws ParseException { 404 this.next(); 405 return Token.token_stringend; 406 } 407 Token processBacksolidus_b() throws ParseException { 408 this.next(); 409 return Token.token_wordedge; 410 } 411 Token processBacksolidus_B() throws ParseException { 412 this.next(); 413 return Token.token_not_wordedge; 414 } 415 Token processBacksolidus_lt() throws ParseException { 416 this.next(); 417 return Token.token_wordbeginning; 418 } 419 Token processBacksolidus_gt() throws ParseException { 420 this.next(); 421 return Token.token_wordend; 422 } 423 Token processStar(Token tok) throws ParseException { 424 this.next(); 425 if (this.read() == T_QUESTION) { 426 this.next(); 427 return Token.createNGClosure(tok); 428 } else 429 return Token.createClosure(tok); 430 } 431 Token processPlus(Token tok) throws ParseException { 432 this.next(); 434 if (this.read() == T_QUESTION) { 435 this.next(); 436 return Token.createConcat(tok, Token.createNGClosure(tok)); 437 } else 438 return Token.createConcat(tok, Token.createClosure(tok)); 439 } 440 Token processQuestion(Token tok) throws ParseException { 441 this.next(); 443 Token par = Token.createUnion(); 444 if (this.read() == T_QUESTION) { 445 this.next(); 446 par.addChild(Token.createEmpty()); 447 par.addChild(tok); 448 } else { 449 par.addChild(tok); 450 par.addChild(Token.createEmpty()); 451 } 452 return par; 453 } 454 boolean checkQuestion(int off) { 455 return off < this.regexlen && this.regex.charAt(off) == '?'; 456 } 457 Token processParen() throws ParseException { 458 this.next(); 459 int p = this.parennumber++; 460 Token tok = Token.createParen(this.parseRegex(), p); 461 if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1); 462 this.next(); return tok; 464 } 465 Token processParen2() throws ParseException { 466 this.next(); 467 Token tok = Token.createParen(this.parseRegex(), 0); 468 if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1); 469 this.next(); return tok; 471 } 472 Token processCondition() throws ParseException { 473 if (this.offset+1 >= this.regexlen) throw ex("parser.factor.4", this.offset); 475 int refno = -1; 477 Token condition = null; 478 int ch = this.regex.charAt(this.offset); 479 if ('1' <= ch && ch <= '9') { 480 refno = ch-'0'; 481 this.hasBackReferences = true; 482 if (this.references == null) this.references = new Vector (); 483 this.references.addElement(new ReferencePosition(refno, this.offset)); 484 this.offset ++; 485 if (this.regex.charAt(this.offset) != ')') throw ex("parser.factor.1", this.offset); 486 this.offset ++; 487 } else { 488 if (ch == '?') this.offset --; this.next(); 490 condition = this.parseFactor(); 491 switch (condition.type) { 492 case Token.LOOKAHEAD: 493 case Token.NEGATIVELOOKAHEAD: 494 case Token.LOOKBEHIND: 495 case Token.NEGATIVELOOKBEHIND: 496 break; 497 case Token.ANCHOR: 498 if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1); 499 break; 500 default: 501 throw ex("parser.factor.5", this.offset); 502 } 503 } 504 this.next(); 506 Token yesPattern = this.parseRegex(); 507 Token noPattern = null; 508 if (yesPattern.type == Token.UNION) { 509 if (yesPattern.size() != 2) throw ex("parser.factor.6", this.offset); 510 noPattern = yesPattern.getChild(1); 511 yesPattern = yesPattern.getChild(0); 512 } 513 if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1); 514 this.next(); 515 return Token.createCondition(refno, condition, yesPattern, noPattern); 516 } 517 Token processModifiers() throws ParseException { 518 int add = 0, mask = 0, ch = -1; 521 while (this.offset < this.regexlen) { 522 ch = this.regex.charAt(this.offset); 523 int v = REUtil.getOptionValue(ch); 524 if (v == 0) break; add |= v; 526 this.offset ++; 527 } 528 if (this.offset >= this.regexlen) throw ex("parser.factor.2", this.offset-1); 529 if (ch == '-') { 530 this.offset ++; 531 while (this.offset < this.regexlen) { 532 ch = this.regex.charAt(this.offset); 533 int v = REUtil.getOptionValue(ch); 534 if (v == 0) break; mask |= v; 536 this.offset ++; 537 } 538 if (this.offset >= this.regexlen) throw ex("parser.factor.2", this.offset-1); 539 } 540 Token tok; 541 if (ch == ':') { 542 this.offset ++; 543 this.next(); 544 tok = Token.createModifierGroup(this.parseRegex(), add, mask); 545 if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1); 546 this.next(); 547 } else if (ch == ')') { this.offset ++; 549 this.next(); 550 tok = Token.createModifierGroup(this.parseRegex(), add, mask); 551 } else 552 throw ex("parser.factor.3", this.offset); 553 554 return tok; 555 } 556 Token processIndependent() throws ParseException { 557 this.next(); 558 Token tok = Token.createLook(Token.INDEPENDENT, this.parseRegex()); 559 if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1); 560 this.next(); return tok; 562 } 563 Token processBacksolidus_c() throws ParseException { 564 int ch2; if (this.offset >= this.regexlen 566 || ((ch2 = this.regex.charAt(this.offset++)) & 0xffe0) != 0x0040) 567 throw ex("parser.atom.1", this.offset-1); 568 this.next(); 569 return Token.createChar(ch2-0x40); 570 } 571 Token processBacksolidus_C() throws ParseException { 572 throw ex("parser.process.1", this.offset); 573 } 574 Token processBacksolidus_i() throws ParseException { 575 Token tok = Token.createChar('i'); 576 this.next(); 577 return tok; 578 } 579 Token processBacksolidus_I() throws ParseException { 580 throw ex("parser.process.1", this.offset); 581 } 582 Token processBacksolidus_g() throws ParseException { 583 this.next(); 584 return Token.getGraphemePattern(); 585 } 586 Token processBacksolidus_X() throws ParseException { 587 this.next(); 588 return Token.getCombiningCharacterSequence(); 589 } 590 Token processBackreference() throws ParseException { 591 int refnum = this.chardata-'0'; 592 Token tok = Token.createBackReference(refnum); 593 this.hasBackReferences = true; 594 if (this.references == null) this.references = new Vector (); 595 this.references.addElement(new ReferencePosition(refnum, this.offset-2)); 596 this.next(); 597 return tok; 598 } 599 600 602 611 Token parseFactor() throws ParseException { 612 int ch = this.read(); 613 Token tok; 614 switch (ch) { 615 case T_CARET: return this.processCaret(); 616 case T_DOLLAR: return this.processDollar(); 617 case T_LOOKAHEAD: return this.processLookahead(); 618 case T_NEGATIVELOOKAHEAD: return this.processNegativelookahead(); 619 case T_LOOKBEHIND: return this.processLookbehind(); 620 case T_NEGATIVELOOKBEHIND: return this.processNegativelookbehind(); 621 622 case T_COMMENT: 623 this.next(); 624 return Token.createEmpty(); 625 626 case T_BACKSOLIDUS: 627 switch (this.chardata) { 628 case 'A': return this.processBacksolidus_A(); 629 case 'Z': return this.processBacksolidus_Z(); 630 case 'z': return this.processBacksolidus_z(); 631 case 'b': return this.processBacksolidus_b(); 632 case 'B': return this.processBacksolidus_B(); 633 case '<': return this.processBacksolidus_lt(); 634 case '>': return this.processBacksolidus_gt(); 635 } 636 } 638 tok = this.parseAtom(); 639 ch = this.read(); 640 switch (ch) { 641 case T_STAR: return this.processStar(tok); 642 case T_PLUS: return this.processPlus(tok); 643 case T_QUESTION: return this.processQuestion(tok); 644 case T_CHAR: 645 if (this.chardata == '{' && this.offset < this.regexlen) { 646 647 int off = this.offset; int min = 0, max = -1; 649 650 if ((ch = this.regex.charAt(off++)) >= '0' && ch <= '9') { 651 652 min = ch -'0'; 653 while (off < this.regexlen 654 && (ch = this.regex.charAt(off++)) >= '0' && ch <= '9') { 655 min = min*10 +ch-'0'; 656 if (min < 0) 657 throw ex("parser.quantifier.5", this.offset); 658 } 659 } 660 else { 661 throw ex("parser.quantifier.1", this.offset); 662 } 663 664 max = min; 665 if (ch == ',') { 666 667 if (off >= this.regexlen) { 668 throw ex("parser.quantifier.3", this.offset); 669 } 670 else if ((ch = this.regex.charAt(off++)) >= '0' && ch <= '9') { 671 672 max = ch -'0'; while (off < this.regexlen 674 && (ch = this.regex.charAt(off++)) >= '0' 675 && ch <= '9') { 676 max = max*10 +ch-'0'; 677 if (max < 0) 678 throw ex("parser.quantifier.5", this.offset); 679 } 680 681 if (min > max) 682 throw ex("parser.quantifier.4", this.offset); 683 } 684 else { max = -1; 686 } 687 } 688 689 if (ch != '}') 690 throw ex("parser.quantifier.2", this.offset); 691 692 if (this.checkQuestion(off)) { tok = Token.createNGClosure(tok); 694 this.offset = off+1; 695 } else { 696 tok = Token.createClosure(tok); 697 this.offset = off; 698 } 699 700 tok.setMin(min); 701 tok.setMax(max); 702 this.next(); 704 } 705 } 706 return tok; 707 } 708 709 715 Token parseAtom() throws ParseException { 716 int ch = this.read(); 717 Token tok = null; 718 switch (ch) { 719 case T_LPAREN: return this.processParen(); 720 case T_LPAREN2: return this.processParen2(); case T_CONDITION: return this.processCondition(); case T_MODIFIERS: return this.processModifiers(); case T_INDEPENDENT: return this.processIndependent(); 724 case T_DOT: 725 this.next(); tok = Token.token_dot; 727 break; 728 729 736 case T_LBRACKET: return this.parseCharacterClass(true); 737 case T_SET_OPERATIONS: return this.parseSetOperations(); 738 739 case T_BACKSOLIDUS: 740 switch (this.chardata) { 741 case 'd': case 'D': 742 case 'w': case 'W': 743 case 's': case 'S': 744 tok = this.getTokenForShorthand(this.chardata); 745 this.next(); 746 return tok; 747 748 case 'e': case 'f': case 'n': case 'r': 749 case 't': case 'u': case 'v': case 'x': 750 { 751 int ch2 = this.decodeEscaped(); 752 if (ch2 < 0x10000) { 753 tok = Token.createChar(ch2); 754 } else { 755 tok = Token.createString(REUtil.decomposeToSurrogates(ch2)); 756 } 757 } 758 break; 759 760 case 'c': return this.processBacksolidus_c(); 761 case 'C': return this.processBacksolidus_C(); 762 case 'i': return this.processBacksolidus_i(); 763 case 'I': return this.processBacksolidus_I(); 764 case 'g': return this.processBacksolidus_g(); 765 case 'X': return this.processBacksolidus_X(); 766 case '1': case '2': case '3': case '4': 767 case '5': case '6': case '7': case '8': case '9': 768 return this.processBackreference(); 769 770 case 'P': 771 case 'p': 772 int pstart = this.offset; 773 tok = processBacksolidus_pP(this.chardata); 774 if (tok == null) throw this.ex("parser.atom.5", pstart); 775 break; 776 777 default: 778 tok = Token.createChar(this.chardata); 779 } 780 this.next(); 781 break; 782 783 case T_CHAR: 784 if (this.chardata == ']' || this.chardata == '{' || this.chardata == '}') 785 throw this.ex("parser.atom.4", this.offset-1); 786 tok = Token.createChar(this.chardata); 787 int high = this.chardata; 788 this.next(); 789 if (REUtil.isHighSurrogate(high) 790 && this.read() == T_CHAR && REUtil.isLowSurrogate(this.chardata)) { 791 char[] sur = new char[2]; 792 sur[0] = (char)high; 793 sur[1] = (char)this.chardata; 794 tok = Token.createParen(Token.createString(new String (sur)), 0); 795 this.next(); 796 } 797 break; 798 799 default: 800 throw this.ex("parser.atom.4", this.offset-1); 801 } 802 return tok; 803 } 804 805 protected RangeToken processBacksolidus_pP(int c) throws ParseException { 806 807 this.next(); 808 if (this.read() != T_CHAR || this.chardata != '{') 809 throw this.ex("parser.atom.2", this.offset-1); 810 811 boolean positive = c == 'p'; 813 int namestart = this.offset; 814 int nameend = this.regex.indexOf('}', namestart); 815 816 if (nameend < 0) 817 throw this.ex("parser.atom.3", this.offset); 818 819 String pname = this.regex.substring(namestart, nameend); 820 this.offset = nameend+1; 821 822 return Token.getRange(pname, positive, this.isSet(RegularExpression.XMLSCHEMA_MODE)); 823 } 824 825 int processCIinCharacterClass(RangeToken tok, int c) { 826 return this.decodeEscaped(); 827 } 828 829 836 protected RangeToken parseCharacterClass(boolean useNrange) throws ParseException { 837 this.setContext(S_INBRACKETS); 838 this.next(); boolean nrange = false; 840 RangeToken base = null; 841 RangeToken tok; 842 if (this.read() == T_CHAR && this.chardata == '^') { 843 nrange = true; 844 this.next(); if (useNrange) { 846 tok = Token.createNRange(); 847 } else { 848 base = Token.createRange(); 849 base.addRange(0, Token.UTF16_MAX); 850 tok = Token.createRange(); 851 } 852 } else { 853 tok = Token.createRange(); 854 } 855 int type; 856 boolean firstloop = true; 857 while ((type = this.read()) != T_EOF) { 858 if (type == T_CHAR && this.chardata == ']' && !firstloop) 859 break; 860 firstloop = false; 861 int c = this.chardata; 862 boolean end = false; 863 if (type == T_BACKSOLIDUS) { 864 switch (c) { 865 case 'd': case 'D': 866 case 'w': case 'W': 867 case 's': case 'S': 868 tok.mergeRanges(this.getTokenForShorthand(c)); 869 end = true; 870 break; 871 872 case 'i': case 'I': 873 case 'c': case 'C': 874 c = this.processCIinCharacterClass(tok, c); 875 if (c < 0) end = true; 876 break; 877 878 case 'p': 879 case 'P': 880 int pstart = this.offset; 881 RangeToken tok2 = this.processBacksolidus_pP(c); 882 if (tok2 == null) throw this.ex("parser.atom.5", pstart); 883 tok.mergeRanges(tok2); 884 end = true; 885 break; 886 887 default: 888 c = this.decodeEscaped(); 889 } } else if (type == T_POSIX_CHARCLASS_START) { 893 int nameend = this.regex.indexOf(':', this.offset); 894 if (nameend < 0) throw this.ex("parser.cc.1", this.offset); 895 boolean positive = true; 896 if (this.regex.charAt(this.offset) == '^') { 897 this.offset ++; 898 positive = false; 899 } 900 String name = this.regex.substring(this.offset, nameend); 901 RangeToken range = Token.getRange(name, positive, 902 this.isSet(RegularExpression.XMLSCHEMA_MODE)); 903 if (range == null) throw this.ex("parser.cc.3", this.offset); 904 tok.mergeRanges(range); 905 end = true; 906 if (nameend+1 >= this.regexlen || this.regex.charAt(nameend+1) != ']') 907 throw this.ex("parser.cc.1", nameend); 908 this.offset = nameend+2; 909 } 910 this.next(); 911 if (!end) { if (this.read() != T_CHAR || this.chardata != '-') { tok.addRange(c, c); 914 } else { 915 this.next(); if ((type = this.read()) == T_EOF) throw this.ex("parser.cc.2", this.offset); 917 if (type == T_CHAR && this.chardata == ']') { 918 tok.addRange(c, c); 919 tok.addRange('-', '-'); 920 } else { 921 int rangeend = this.chardata; 922 if (type == T_BACKSOLIDUS) 923 rangeend = this.decodeEscaped(); 924 this.next(); 925 tok.addRange(c, rangeend); 926 } 927 } 928 } 929 if (this.isSet(RegularExpression.SPECIAL_COMMA) 930 && this.read() == T_CHAR && this.chardata == ',') 931 this.next(); 932 } 933 if (this.read() == T_EOF) 934 throw this.ex("parser.cc.2", this.offset); 935 if (!useNrange && nrange) { 936 base.subtractRanges(tok); 937 tok = base; 938 } 939 tok.sortRanges(); 940 tok.compactRanges(); 941 946 this.setContext(S_NORMAL); 947 this.next(); 949 return tok; 950 } 951 952 955 protected RangeToken parseSetOperations() throws ParseException { 956 RangeToken tok = this.parseCharacterClass(false); 957 int type; 958 while ((type = this.read()) != T_RPAREN) { 959 int ch = this.chardata; 960 if (type == T_CHAR && (ch == '-' || ch == '&') 961 || type == T_PLUS) { 962 this.next(); 963 if (this.read() != T_LBRACKET) throw ex("parser.ope.1", this.offset-1); 964 RangeToken t2 = this.parseCharacterClass(false); 965 if (type == T_PLUS) 966 tok.mergeRanges(t2); 967 else if (ch == '-') 968 tok.subtractRanges(t2); 969 else if (ch == '&') 970 tok.intersectRanges(t2); 971 else 972 throw new RuntimeException ("ASSERT"); 973 } else { 974 throw ex("parser.ope.2", this.offset-1); 975 } 976 } 977 this.next(); 978 return tok; 979 } 980 981 Token getTokenForShorthand(int ch) { 982 Token tok; 983 switch (ch) { 984 case 'd': 985 tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY) 986 ? Token.getRange("Nd", true) : Token.token_0to9; 987 break; 988 case 'D': 989 tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY) 990 ? Token.getRange("Nd", false) : Token.token_not_0to9; 991 break; 992 case 'w': 993 tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY) 994 ? Token.getRange("IsWord", true) : Token.token_wordchars; 995 break; 996 case 'W': 997 tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY) 998 ? Token.getRange("IsWord", false) : Token.token_not_wordchars; 999 break; 1000 case 's': 1001 tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY) 1002 ? Token.getRange("IsSpace", true) : Token.token_spaces; 1003 break; 1004 case 'S': 1005 tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY) 1006 ? Token.getRange("IsSpace", false) : Token.token_not_spaces; 1007 break; 1008 1009 default: 1010 throw new RuntimeException ("Internal Error: shorthands: \\u"+Integer.toString(ch, 16)); 1011 } 1012 return tok; 1013 } 1014 1015 1017 int decodeEscaped() throws ParseException { 1018 if (this.read() != T_BACKSOLIDUS) throw ex("parser.next.1", this.offset-1); 1019 int c = this.chardata; 1020 switch (c) { 1021 case 'e': c = 0x1b; break; case 'f': c = '\f'; break; case 'n': c = '\n'; break; case 'r': c = '\r'; break; case 't': c = '\t'; break; case 'x': 1028 this.next(); 1029 if (this.read() != T_CHAR) throw ex("parser.descape.1", this.offset-1); 1030 if (this.chardata == '{') { 1031 int v1 = 0; 1032 int uv = 0; 1033 do { 1034 this.next(); 1035 if (this.read() != T_CHAR) throw ex("parser.descape.1", this.offset-1); 1036 if ((v1 = hexChar(this.chardata)) < 0) 1037 break; 1038 if (uv > uv*16) throw ex("parser.descape.2", this.offset-1); 1039 uv = uv*16+v1; 1040 } while (true); 1041 if (this.chardata != '}') throw ex("parser.descape.3", this.offset-1); 1042 if (uv > Token.UTF16_MAX) throw ex("parser.descape.4", this.offset-1); 1043 c = uv; 1044 } else { 1045 int v1 = 0; 1046 if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) 1047 throw ex("parser.descape.1", this.offset-1); 1048 int uv = v1; 1049 this.next(); 1050 if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) 1051 throw ex("parser.descape.1", this.offset-1); 1052 uv = uv*16+v1; 1053 c = uv; 1054 } 1055 break; 1056 1057 case 'u': 1058 int v1 = 0; 1059 this.next(); 1060 if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) 1061 throw ex("parser.descape.1", this.offset-1); 1062 int uv = v1; 1063 this.next(); 1064 if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) 1065 throw ex("parser.descape.1", this.offset-1); 1066 uv = uv*16+v1; 1067 this.next(); 1068 if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) 1069 throw ex("parser.descape.1", this.offset-1); 1070 uv = uv*16+v1; 1071 this.next(); 1072 if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) 1073 throw ex("parser.descape.1", this.offset-1); 1074 uv = uv*16+v1; 1075 c = uv; 1076 break; 1077 1078 case 'v': 1079 this.next(); 1080 if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) 1081 throw ex("parser.descape.1", this.offset-1); 1082 uv = v1; 1083 this.next(); 1084 if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) 1085 throw ex("parser.descape.1", this.offset-1); 1086 uv = uv*16+v1; 1087 this.next(); 1088 if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) 1089 throw ex("parser.descape.1", this.offset-1); 1090 uv = uv*16+v1; 1091 this.next(); 1092 if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) 1093 throw ex("parser.descape.1", this.offset-1); 1094 uv = uv*16+v1; 1095 this.next(); 1096 if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) 1097 throw ex("parser.descape.1", this.offset-1); 1098 uv = uv*16+v1; 1099 this.next(); 1100 if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) 1101 throw ex("parser.descape.1", this.offset-1); 1102 uv = uv*16+v1; 1103 if (uv > Token.UTF16_MAX) throw ex("parser.descappe.4", this.offset-1); 1104 c = uv; 1105 break; 1106 case 'A': 1107 case 'Z': 1108 case 'z': 1109 throw ex("parser.descape.5", this.offset-2); 1110 default: 1111 } 1112 return c; 1113 } 1114 1115 static private final int hexChar(int ch) { 1116 if (ch < '0') return -1; 1117 if (ch > 'f') return -1; 1118 if (ch <= '9') return ch-'0'; 1119 if (ch < 'A') return -1; 1120 if (ch <= 'F') return ch-'A'+10; 1121 if (ch < 'a') return -1; 1122 return ch-'a'+10; 1123 } 1124} 1125 | Popular Tags |