1 16 17 package org.apache.xerces.impl.xpath.regex; 18 19 import java.util.Locale ; 20 import java.util.MissingResourceException ; 21 import java.util.ResourceBundle ; 22 import java.util.Vector ; 23 24 31 class RegexParser { 32 static final int T_CHAR = 0; 33 static final int T_EOF = 1; 34 static final int T_OR = 2; static final int T_STAR = 3; static final int T_PLUS = 4; static final int T_QUESTION = 5; static final int T_LPAREN = 6; static final int T_RPAREN = 7; static final int T_DOT = 8; static final int T_LBRACKET = 9; static final int T_BACKSOLIDUS = 10; static final int T_CARET = 11; static final int T_DOLLAR = 12; static final int T_LPAREN2 = 13; static final int T_LOOKAHEAD = 14; static final int T_NEGATIVELOOKAHEAD = 15; static final int T_LOOKBEHIND = 16; static final int T_NEGATIVELOOKBEHIND = 17; static final int T_INDEPENDENT = 18; static final int T_SET_OPERATIONS = 19; static final int T_POSIX_CHARCLASS_START = 20; static final int T_COMMENT = 21; static final int T_MODIFIERS = 22; static final int T_CONDITION = 23; static final int T_XMLSCHEMA_CC_SUBTRACTION = 24; 58 static class ReferencePosition { 59 int refNumber; 60 int position; 61 ReferencePosition(int n, int pos) { 62 this.refNumber = n; 63 this.position = pos; 64 } 65 } 66 67 int offset; 68 String regex; 69 int regexlen; 70 int options; 71 ResourceBundle resources; 72 int chardata; 73 int nexttoken; 74 static protected final int S_NORMAL = 0; 75 static protected final int S_INBRACKETS = 1; 76 static protected final int S_INXBRACKETS = 2; 77 int context = S_NORMAL; 78 int parennumber = 1; 79 boolean hasBackReferences; 80 Vector references = null; 81 82 public RegexParser() { 83 this.setLocale(Locale.getDefault()); 84 } 85 public RegexParser(Locale locale) { 86 this.setLocale(locale); 87 } 88 89 public void setLocale(Locale locale) { 90 try { 91 this.resources = ResourceBundle.getBundle("org.apache.xerces.impl.xpath.regex.message", locale); 92 } catch (MissingResourceException mre) { 93 throw new RuntimeException ("Installation Problem??? Couldn't load messages: " 94 +mre.getMessage()); 95 } 96 } 97 98 final ParseException ex(String key, int loc) { 99 return new ParseException(this.resources.getString(key), loc); 100 } 101 102 private final boolean isSet(int flag) { 103 return (this.options & flag) == flag; 104 } 105 106 synchronized Token parse(String regex, int options) throws ParseException { 107 this.options = options; 108 this.offset = 0; 109 this.setContext(S_NORMAL); 110 this.parennumber = 1; 111 this.hasBackReferences = false; 112 this.regex = regex; 113 if (this.isSet(RegularExpression.EXTENDED_COMMENT)) 114 this.regex = REUtil.stripExtendedComment(this.regex); 115 this.regexlen = this.regex.length(); 116 117 118 this.next(); 119 Token ret = this.parseRegex(); 120 if (this.offset != this.regexlen) 121 throw ex("parser.parse.1", this.offset); 122 if (this.references != null) { 123 for (int i = 0; i < this.references.size(); i ++) { 124 ReferencePosition position = (ReferencePosition)this.references.elementAt(i); 125 if (this.parennumber <= position.refNumber) 126 throw ex("parser.parse.2", position.position); 127 } 128 this.references.removeAllElements(); 129 } 130 return ret; 131 } 132 133 139 140 protected final void setContext(int con) { 141 this.context = con; 142 } 143 144 final int read() { 145 return this.nexttoken; 146 } 147 148 final void next() { 149 if (this.offset >= this.regexlen) { 150 this.chardata = -1; 151 this.nexttoken = T_EOF; 152 return; 153 } 154 155 int ret; 156 int ch = this.regex.charAt(this.offset++); 157 this.chardata = ch; 158 159 if (this.context == S_INBRACKETS) { 160 switch (ch) { 163 case '\\': 164 ret = T_BACKSOLIDUS; 165 if (this.offset >= this.regexlen) 166 throw ex("parser.next.1", this.offset-1); 167 this.chardata = this.regex.charAt(this.offset++); 168 break; 169 170 case '-': 171 if (this.isSet(RegularExpression.XMLSCHEMA_MODE) 172 && this.offset < this.regexlen && this.regex.charAt(this.offset) == '[') { 173 this.offset++; 174 ret = T_XMLSCHEMA_CC_SUBTRACTION; 175 } else 176 ret = T_CHAR; 177 break; 178 179 case '[': 180 if (!this.isSet(RegularExpression.XMLSCHEMA_MODE) 181 && this.offset < this.regexlen && this.regex.charAt(this.offset) == ':') { 182 this.offset++; 183 ret = T_POSIX_CHARCLASS_START; 184 break; 185 } default: 187 if (REUtil.isHighSurrogate(ch) && this.offset < this.regexlen) { 188 int low = this.regex.charAt(this.offset); 189 if (REUtil.isLowSurrogate(low)) { 190 this.chardata = REUtil.composeFromSurrogates(ch, low); 191 this.offset ++; 192 } 193 } 194 ret = T_CHAR; 195 } 196 this.nexttoken = ret; 197 return; 198 } 199 200 switch (ch) { 201 case '|': ret = T_OR; break; 202 case '*': ret = T_STAR; break; 203 case '+': ret = T_PLUS; break; 204 case '?': ret = T_QUESTION; break; 205 case ')': ret = T_RPAREN; break; 206 case '.': ret = T_DOT; break; 207 case '[': ret = T_LBRACKET; break; 208 case '^': ret = T_CARET; break; 209 case '$': ret = T_DOLLAR; break; 210 case '(': 211 ret = T_LPAREN; 212 if (this.offset >= this.regexlen) 213 break; 214 if (this.regex.charAt(this.offset) != '?') 215 break; 216 if (++this.offset >= this.regexlen) 217 throw ex("parser.next.2", this.offset-1); 218 ch = this.regex.charAt(this.offset++); 219 switch (ch) { 220 case ':': ret = T_LPAREN2; break; 221 case '=': ret = T_LOOKAHEAD; break; 222 case '!': ret = T_NEGATIVELOOKAHEAD; break; 223 case '[': ret = T_SET_OPERATIONS; break; 224 case '>': ret = T_INDEPENDENT; break; 225 case '<': 226 if (this.offset >= this.regexlen) 227 throw ex("parser.next.2", this.offset-3); 228 ch = this.regex.charAt(this.offset++); 229 if (ch == '=') { 230 ret = T_LOOKBEHIND; 231 } else if (ch == '!') { 232 ret = T_NEGATIVELOOKBEHIND; 233 } else 234 throw ex("parser.next.3", this.offset-3); 235 break; 236 case '#': 237 while (this.offset < this.regexlen) { 238 ch = this.regex.charAt(this.offset++); 239 if (ch == ')') break; 240 } 241 if (ch != ')') 242 throw ex("parser.next.4", this.offset-1); 243 ret = T_COMMENT; 244 break; 245 default: 246 if (ch == '-' || 'a' <= ch && ch <= 'z' || 'A' <= ch && ch <= 'Z') { this.offset --; 248 ret = T_MODIFIERS; 249 break; 250 } else if (ch == '(') { ret = T_CONDITION; break; 253 } 254 throw ex("parser.next.2", this.offset-2); 255 } 256 break; 257 258 case '\\': 259 ret = T_BACKSOLIDUS; 260 if (this.offset >= this.regexlen) 261 throw ex("parser.next.1", this.offset-1); 262 this.chardata = this.regex.charAt(this.offset++); 263 break; 264 265 default: 266 ret = T_CHAR; 267 } 268 this.nexttoken = ret; 269 } 270 271 280 Token parseRegex() throws ParseException { 281 Token tok = this.parseTerm(); 282 Token parent = null; 283 while (this.read() == T_OR) { 284 this.next(); if (parent == null) { 286 parent = Token.createUnion(); 287 parent.addChild(tok); 288 tok = parent; 289 } 290 tok.addChild(this.parseTerm()); 291 } 292 return tok; 293 } 294 295 298 Token parseTerm() throws ParseException { 299 int ch = this.read(); 300 if (ch == T_OR || ch == T_RPAREN || ch == T_EOF) { 301 return Token.createEmpty(); 302 } else { 303 Token tok = this.parseFactor(); 304 Token concat = null; 305 while ((ch = this.read()) != T_OR && ch != T_RPAREN && ch != T_EOF) { 306 if (concat == null) { 307 concat = Token.createConcat(); 308 concat.addChild(tok); 309 tok = concat; 310 } 311 concat.addChild(this.parseFactor()); 312 } 314 return tok; 315 } 316 } 317 318 320 Token processCaret() throws ParseException { 321 this.next(); 322 return Token.token_linebeginning; 323 } 324 Token processDollar() throws ParseException { 325 this.next(); 326 return Token.token_lineend; 327 } 328 Token processLookahead() throws ParseException { 329 this.next(); 330 Token tok = Token.createLook(Token.LOOKAHEAD, this.parseRegex()); 331 if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1); 332 this.next(); return tok; 334 } 335 Token processNegativelookahead() throws ParseException { 336 this.next(); 337 Token tok = Token.createLook(Token.NEGATIVELOOKAHEAD, this.parseRegex()); 338 if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1); 339 this.next(); return tok; 341 } 342 Token processLookbehind() throws ParseException { 343 this.next(); 344 Token tok = Token.createLook(Token.LOOKBEHIND, this.parseRegex()); 345 if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1); 346 this.next(); return tok; 348 } 349 Token processNegativelookbehind() throws ParseException { 350 this.next(); 351 Token tok = Token.createLook(Token.NEGATIVELOOKBEHIND, this.parseRegex()); 352 if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1); 353 this.next(); return tok; 355 } 356 Token processBacksolidus_A() throws ParseException { 357 this.next(); 358 return Token.token_stringbeginning; 359 } 360 Token processBacksolidus_Z() throws ParseException { 361 this.next(); 362 return Token.token_stringend2; 363 } 364 Token processBacksolidus_z() throws ParseException { 365 this.next(); 366 return Token.token_stringend; 367 } 368 Token processBacksolidus_b() throws ParseException { 369 this.next(); 370 return Token.token_wordedge; 371 } 372 Token processBacksolidus_B() throws ParseException { 373 this.next(); 374 return Token.token_not_wordedge; 375 } 376 Token processBacksolidus_lt() throws ParseException { 377 this.next(); 378 return Token.token_wordbeginning; 379 } 380 Token processBacksolidus_gt() throws ParseException { 381 this.next(); 382 return Token.token_wordend; 383 } 384 Token processStar(Token tok) throws ParseException { 385 this.next(); 386 if (this.read() == T_QUESTION) { 387 this.next(); 388 return Token.createNGClosure(tok); 389 } else 390 return Token.createClosure(tok); 391 } 392 Token processPlus(Token tok) throws ParseException { 393 this.next(); 395 if (this.read() == T_QUESTION) { 396 this.next(); 397 return Token.createConcat(tok, Token.createNGClosure(tok)); 398 } else 399 return Token.createConcat(tok, Token.createClosure(tok)); 400 } 401 Token processQuestion(Token tok) throws ParseException { 402 this.next(); 404 Token par = Token.createUnion(); 405 if (this.read() == T_QUESTION) { 406 this.next(); 407 par.addChild(Token.createEmpty()); 408 par.addChild(tok); 409 } else { 410 par.addChild(tok); 411 par.addChild(Token.createEmpty()); 412 } 413 return par; 414 } 415 boolean checkQuestion(int off) { 416 return off < this.regexlen && this.regex.charAt(off) == '?'; 417 } 418 Token processParen() throws ParseException { 419 this.next(); 420 int p = this.parennumber++; 421 Token tok = Token.createParen(this.parseRegex(), p); 422 if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1); 423 this.next(); return tok; 425 } 426 Token processParen2() throws ParseException { 427 this.next(); 428 Token tok = Token.createParen(this.parseRegex(), 0); 429 if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1); 430 this.next(); return tok; 432 } 433 Token processCondition() throws ParseException { 434 if (this.offset+1 >= this.regexlen) throw ex("parser.factor.4", this.offset); 436 int refno = -1; 438 Token condition = null; 439 int ch = this.regex.charAt(this.offset); 440 if ('1' <= ch && ch <= '9') { 441 refno = ch-'0'; 442 this.hasBackReferences = true; 443 if (this.references == null) this.references = new Vector (); 444 this.references.addElement(new ReferencePosition(refno, this.offset)); 445 this.offset ++; 446 if (this.regex.charAt(this.offset) != ')') throw ex("parser.factor.1", this.offset); 447 this.offset ++; 448 } else { 449 if (ch == '?') this.offset --; this.next(); 451 condition = this.parseFactor(); 452 switch (condition.type) { 453 case Token.LOOKAHEAD: 454 case Token.NEGATIVELOOKAHEAD: 455 case Token.LOOKBEHIND: 456 case Token.NEGATIVELOOKBEHIND: 457 break; 458 case Token.ANCHOR: 459 if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1); 460 break; 461 default: 462 throw ex("parser.factor.5", this.offset); 463 } 464 } 465 this.next(); 467 Token yesPattern = this.parseRegex(); 468 Token noPattern = null; 469 if (yesPattern.type == Token.UNION) { 470 if (yesPattern.size() != 2) throw ex("parser.factor.6", this.offset); 471 noPattern = yesPattern.getChild(1); 472 yesPattern = yesPattern.getChild(0); 473 } 474 if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1); 475 this.next(); 476 return Token.createCondition(refno, condition, yesPattern, noPattern); 477 } 478 Token processModifiers() throws ParseException { 479 int add = 0, mask = 0, ch = -1; 482 while (this.offset < this.regexlen) { 483 ch = this.regex.charAt(this.offset); 484 int v = REUtil.getOptionValue(ch); 485 if (v == 0) break; add |= v; 487 this.offset ++; 488 } 489 if (this.offset >= this.regexlen) throw ex("parser.factor.2", this.offset-1); 490 if (ch == '-') { 491 this.offset ++; 492 while (this.offset < this.regexlen) { 493 ch = this.regex.charAt(this.offset); 494 int v = REUtil.getOptionValue(ch); 495 if (v == 0) break; mask |= v; 497 this.offset ++; 498 } 499 if (this.offset >= this.regexlen) throw ex("parser.factor.2", this.offset-1); 500 } 501 Token tok; 502 if (ch == ':') { 503 this.offset ++; 504 this.next(); 505 tok = Token.createModifierGroup(this.parseRegex(), add, mask); 506 if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1); 507 this.next(); 508 } else if (ch == ')') { this.offset ++; 510 this.next(); 511 tok = Token.createModifierGroup(this.parseRegex(), add, mask); 512 } else 513 throw ex("parser.factor.3", this.offset); 514 515 return tok; 516 } 517 Token processIndependent() throws ParseException { 518 this.next(); 519 Token tok = Token.createLook(Token.INDEPENDENT, this.parseRegex()); 520 if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1); 521 this.next(); return tok; 523 } 524 Token processBacksolidus_c() throws ParseException { 525 int ch2; if (this.offset >= this.regexlen 527 || ((ch2 = this.regex.charAt(this.offset++)) & 0xffe0) != 0x0040) 528 throw ex("parser.atom.1", this.offset-1); 529 this.next(); 530 return Token.createChar(ch2-0x40); 531 } 532 Token processBacksolidus_C() throws ParseException { 533 throw ex("parser.process.1", this.offset); 534 } 535 Token processBacksolidus_i() throws ParseException { 536 Token tok = Token.createChar('i'); 537 this.next(); 538 return tok; 539 } 540 Token processBacksolidus_I() throws ParseException { 541 throw ex("parser.process.1", this.offset); 542 } 543 Token processBacksolidus_g() throws ParseException { 544 this.next(); 545 return Token.getGraphemePattern(); 546 } 547 Token processBacksolidus_X() throws ParseException { 548 this.next(); 549 return Token.getCombiningCharacterSequence(); 550 } 551 Token processBackreference() throws ParseException { 552 int refnum = this.chardata-'0'; 553 Token tok = Token.createBackReference(refnum); 554 this.hasBackReferences = true; 555 if (this.references == null) this.references = new Vector (); 556 this.references.addElement(new ReferencePosition(refnum, this.offset-2)); 557 this.next(); 558 return tok; 559 } 560 561 563 572 Token parseFactor() throws ParseException { 573 int ch = this.read(); 574 Token tok; 575 switch (ch) { 576 case T_CARET: return this.processCaret(); 577 case T_DOLLAR: return this.processDollar(); 578 case T_LOOKAHEAD: return this.processLookahead(); 579 case T_NEGATIVELOOKAHEAD: return this.processNegativelookahead(); 580 case T_LOOKBEHIND: return this.processLookbehind(); 581 case T_NEGATIVELOOKBEHIND: return this.processNegativelookbehind(); 582 583 case T_COMMENT: 584 this.next(); 585 return Token.createEmpty(); 586 587 case T_BACKSOLIDUS: 588 switch (this.chardata) { 589 case 'A': return this.processBacksolidus_A(); 590 case 'Z': return this.processBacksolidus_Z(); 591 case 'z': return this.processBacksolidus_z(); 592 case 'b': return this.processBacksolidus_b(); 593 case 'B': return this.processBacksolidus_B(); 594 case '<': return this.processBacksolidus_lt(); 595 case '>': return this.processBacksolidus_gt(); 596 } 597 } 599 tok = this.parseAtom(); 600 ch = this.read(); 601 switch (ch) { 602 case T_STAR: return this.processStar(tok); 603 case T_PLUS: return this.processPlus(tok); 604 case T_QUESTION: return this.processQuestion(tok); 605 case T_CHAR: 606 if (this.chardata == '{' && this.offset < this.regexlen) { 607 608 int off = this.offset; int min = 0, max = -1; 610 611 if ((ch = this.regex.charAt(off++)) >= '0' && ch <= '9') { 612 613 min = ch -'0'; 614 while (off < this.regexlen 615 && (ch = this.regex.charAt(off++)) >= '0' && ch <= '9') { 616 min = min*10 +ch-'0'; 617 if (min < 0) 618 throw ex("parser.quantifier.5", this.offset); 619 } 620 } 621 else { 622 throw ex("parser.quantifier.1", this.offset); 623 } 624 625 max = min; 626 if (ch == ',') { 627 628 if (off >= this.regexlen) { 629 throw ex("parser.quantifier.3", this.offset); 630 } 631 else if ((ch = this.regex.charAt(off++)) >= '0' && ch <= '9') { 632 633 max = ch -'0'; while (off < this.regexlen 635 && (ch = this.regex.charAt(off++)) >= '0' 636 && ch <= '9') { 637 max = max*10 +ch-'0'; 638 if (max < 0) 639 throw ex("parser.quantifier.5", this.offset); 640 } 641 642 if (min > max) 643 throw ex("parser.quantifier.4", this.offset); 644 } 645 else { max = -1; 647 } 648 } 649 650 if (ch != '}') 651 throw ex("parser.quantifier.2", this.offset); 652 653 if (this.checkQuestion(off)) { tok = Token.createNGClosure(tok); 655 this.offset = off+1; 656 } else { 657 tok = Token.createClosure(tok); 658 this.offset = off; 659 } 660 661 tok.setMin(min); 662 tok.setMax(max); 663 this.next(); 665 } 666 } 667 return tok; 668 } 669 670 676 Token parseAtom() throws ParseException { 677 int ch = this.read(); 678 Token tok = null; 679 switch (ch) { 680 case T_LPAREN: return this.processParen(); 681 case T_LPAREN2: return this.processParen2(); case T_CONDITION: return this.processCondition(); case T_MODIFIERS: return this.processModifiers(); case T_INDEPENDENT: return this.processIndependent(); 685 case T_DOT: 686 this.next(); tok = Token.token_dot; 688 break; 689 690 697 case T_LBRACKET: return this.parseCharacterClass(true); 698 case T_SET_OPERATIONS: return this.parseSetOperations(); 699 700 case T_BACKSOLIDUS: 701 switch (this.chardata) { 702 case 'd': case 'D': 703 case 'w': case 'W': 704 case 's': case 'S': 705 tok = this.getTokenForShorthand(this.chardata); 706 this.next(); 707 return tok; 708 709 case 'e': case 'f': case 'n': case 'r': 710 case 't': case 'u': case 'v': case 'x': 711 { 712 int ch2 = this.decodeEscaped(); 713 if (ch2 < 0x10000) { 714 tok = Token.createChar(ch2); 715 } else { 716 tok = Token.createString(REUtil.decomposeToSurrogates(ch2)); 717 } 718 } 719 break; 720 721 case 'c': return this.processBacksolidus_c(); 722 case 'C': return this.processBacksolidus_C(); 723 case 'i': return this.processBacksolidus_i(); 724 case 'I': return this.processBacksolidus_I(); 725 case 'g': return this.processBacksolidus_g(); 726 case 'X': return this.processBacksolidus_X(); 727 case '1': case '2': case '3': case '4': 728 case '5': case '6': case '7': case '8': case '9': 729 return this.processBackreference(); 730 731 case 'P': 732 case 'p': 733 int pstart = this.offset; 734 tok = processBacksolidus_pP(this.chardata); 735 if (tok == null) throw this.ex("parser.atom.5", pstart); 736 break; 737 738 default: 739 tok = Token.createChar(this.chardata); 740 } 741 this.next(); 742 break; 743 744 case T_CHAR: 745 if (this.chardata == ']' || this.chardata == '{' || this.chardata == '}') 746 throw this.ex("parser.atom.4", this.offset-1); 747 tok = Token.createChar(this.chardata); 748 int high = this.chardata; 749 this.next(); 750 if (REUtil.isHighSurrogate(high) 751 && this.read() == T_CHAR && REUtil.isLowSurrogate(this.chardata)) { 752 char[] sur = new char[2]; 753 sur[0] = (char)high; 754 sur[1] = (char)this.chardata; 755 tok = Token.createParen(Token.createString(new String (sur)), 0); 756 this.next(); 757 } 758 break; 759 760 default: 761 throw this.ex("parser.atom.4", this.offset-1); 762 } 763 return tok; 764 } 765 766 protected RangeToken processBacksolidus_pP(int c) throws ParseException { 767 768 this.next(); 769 if (this.read() != T_CHAR || this.chardata != '{') 770 throw this.ex("parser.atom.2", this.offset-1); 771 772 boolean positive = c == 'p'; 774 int namestart = this.offset; 775 int nameend = this.regex.indexOf('}', namestart); 776 777 if (nameend < 0) 778 throw this.ex("parser.atom.3", this.offset); 779 780 String pname = this.regex.substring(namestart, nameend); 781 this.offset = nameend+1; 782 783 return Token.getRange(pname, positive, this.isSet(RegularExpression.XMLSCHEMA_MODE)); 784 } 785 786 int processCIinCharacterClass(RangeToken tok, int c) { 787 return this.decodeEscaped(); 788 } 789 790 797 protected RangeToken parseCharacterClass(boolean useNrange) throws ParseException { 798 this.setContext(S_INBRACKETS); 799 this.next(); boolean nrange = false; 801 RangeToken base = null; 802 RangeToken tok; 803 if (this.read() == T_CHAR && this.chardata == '^') { 804 nrange = true; 805 this.next(); if (useNrange) { 807 tok = Token.createNRange(); 808 } else { 809 base = Token.createRange(); 810 base.addRange(0, Token.UTF16_MAX); 811 tok = Token.createRange(); 812 } 813 } else { 814 tok = Token.createRange(); 815 } 816 int type; 817 boolean firstloop = true; 818 while ((type = this.read()) != T_EOF) { 819 if (type == T_CHAR && this.chardata == ']' && !firstloop) 820 break; 821 firstloop = false; 822 int c = this.chardata; 823 boolean end = false; 824 if (type == T_BACKSOLIDUS) { 825 switch (c) { 826 case 'd': case 'D': 827 case 'w': case 'W': 828 case 's': case 'S': 829 tok.mergeRanges(this.getTokenForShorthand(c)); 830 end = true; 831 break; 832 833 case 'i': case 'I': 834 case 'c': case 'C': 835 c = this.processCIinCharacterClass(tok, c); 836 if (c < 0) end = true; 837 break; 838 839 case 'p': 840 case 'P': 841 int pstart = this.offset; 842 RangeToken tok2 = this.processBacksolidus_pP(c); 843 if (tok2 == null) throw this.ex("parser.atom.5", pstart); 844 tok.mergeRanges(tok2); 845 end = true; 846 break; 847 848 default: 849 c = this.decodeEscaped(); 850 } } else if (type == T_POSIX_CHARCLASS_START) { 854 int nameend = this.regex.indexOf(':', this.offset); 855 if (nameend < 0) throw this.ex("parser.cc.1", this.offset); 856 boolean positive = true; 857 if (this.regex.charAt(this.offset) == '^') { 858 this.offset ++; 859 positive = false; 860 } 861 String name = this.regex.substring(this.offset, nameend); 862 RangeToken range = Token.getRange(name, positive, 863 this.isSet(RegularExpression.XMLSCHEMA_MODE)); 864 if (range == null) throw this.ex("parser.cc.3", this.offset); 865 tok.mergeRanges(range); 866 end = true; 867 if (nameend+1 >= this.regexlen || this.regex.charAt(nameend+1) != ']') 868 throw this.ex("parser.cc.1", nameend); 869 this.offset = nameend+2; 870 } 871 this.next(); 872 if (!end) { if (this.read() != T_CHAR || this.chardata != '-') { tok.addRange(c, c); 875 } else { 876 this.next(); if ((type = this.read()) == T_EOF) throw this.ex("parser.cc.2", this.offset); 878 if (type == T_CHAR && this.chardata == ']') { 879 tok.addRange(c, c); 880 tok.addRange('-', '-'); 881 } else { 882 int rangeend = this.chardata; 883 if (type == T_BACKSOLIDUS) 884 rangeend = this.decodeEscaped(); 885 this.next(); 886 tok.addRange(c, rangeend); 887 } 888 } 889 } 890 if (this.isSet(RegularExpression.SPECIAL_COMMA) 891 && this.read() == T_CHAR && this.chardata == ',') 892 this.next(); 893 } 894 if (this.read() == T_EOF) 895 throw this.ex("parser.cc.2", this.offset); 896 if (!useNrange && nrange) { 897 base.subtractRanges(tok); 898 tok = base; 899 } 900 tok.sortRanges(); 901 tok.compactRanges(); 902 907 this.setContext(S_NORMAL); 908 this.next(); 910 return tok; 911 } 912 913 916 protected RangeToken parseSetOperations() throws ParseException { 917 RangeToken tok = this.parseCharacterClass(false); 918 int type; 919 while ((type = this.read()) != T_RPAREN) { 920 int ch = this.chardata; 921 if (type == T_CHAR && (ch == '-' || ch == '&') 922 || type == T_PLUS) { 923 this.next(); 924 if (this.read() != T_LBRACKET) throw ex("parser.ope.1", this.offset-1); 925 RangeToken t2 = this.parseCharacterClass(false); 926 if (type == T_PLUS) 927 tok.mergeRanges(t2); 928 else if (ch == '-') 929 tok.subtractRanges(t2); 930 else if (ch == '&') 931 tok.intersectRanges(t2); 932 else 933 throw new RuntimeException ("ASSERT"); 934 } else { 935 throw ex("parser.ope.2", this.offset-1); 936 } 937 } 938 this.next(); 939 return tok; 940 } 941 942 Token getTokenForShorthand(int ch) { 943 Token tok; 944 switch (ch) { 945 case 'd': 946 tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY) 947 ? Token.getRange("Nd", true) : Token.token_0to9; 948 break; 949 case 'D': 950 tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY) 951 ? Token.getRange("Nd", false) : Token.token_not_0to9; 952 break; 953 case 'w': 954 tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY) 955 ? Token.getRange("IsWord", true) : Token.token_wordchars; 956 break; 957 case 'W': 958 tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY) 959 ? Token.getRange("IsWord", false) : Token.token_not_wordchars; 960 break; 961 case 's': 962 tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY) 963 ? Token.getRange("IsSpace", true) : Token.token_spaces; 964 break; 965 case 'S': 966 tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY) 967 ? Token.getRange("IsSpace", false) : Token.token_not_spaces; 968 break; 969 970 default: 971 throw new RuntimeException ("Internal Error: shorthands: \\u"+Integer.toString(ch, 16)); 972 } 973 return tok; 974 } 975 976 978 int decodeEscaped() throws ParseException { 979 if (this.read() != T_BACKSOLIDUS) throw ex("parser.next.1", this.offset-1); 980 int c = this.chardata; 981 switch (c) { 982 case 'e': c = 0x1b; break; case 'f': c = '\f'; break; case 'n': c = '\n'; break; case 'r': c = '\r'; break; case 't': c = '\t'; break; case 'x': 989 this.next(); 990 if (this.read() != T_CHAR) throw ex("parser.descape.1", this.offset-1); 991 if (this.chardata == '{') { 992 int v1 = 0; 993 int uv = 0; 994 do { 995 this.next(); 996 if (this.read() != T_CHAR) throw ex("parser.descape.1", this.offset-1); 997 if ((v1 = hexChar(this.chardata)) < 0) 998 break; 999 if (uv > uv*16) throw ex("parser.descape.2", this.offset-1); 1000 uv = uv*16+v1; 1001 } while (true); 1002 if (this.chardata != '}') throw ex("parser.descape.3", this.offset-1); 1003 if (uv > Token.UTF16_MAX) throw ex("parser.descape.4", this.offset-1); 1004 c = uv; 1005 } else { 1006 int v1 = 0; 1007 if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) 1008 throw ex("parser.descape.1", this.offset-1); 1009 int uv = v1; 1010 this.next(); 1011 if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) 1012 throw ex("parser.descape.1", this.offset-1); 1013 uv = uv*16+v1; 1014 c = uv; 1015 } 1016 break; 1017 1018 case 'u': 1019 int v1 = 0; 1020 this.next(); 1021 if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) 1022 throw ex("parser.descape.1", this.offset-1); 1023 int uv = v1; 1024 this.next(); 1025 if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) 1026 throw ex("parser.descape.1", this.offset-1); 1027 uv = uv*16+v1; 1028 this.next(); 1029 if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) 1030 throw ex("parser.descape.1", this.offset-1); 1031 uv = uv*16+v1; 1032 this.next(); 1033 if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) 1034 throw ex("parser.descape.1", this.offset-1); 1035 uv = uv*16+v1; 1036 c = uv; 1037 break; 1038 1039 case 'v': 1040 this.next(); 1041 if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) 1042 throw ex("parser.descape.1", this.offset-1); 1043 uv = v1; 1044 this.next(); 1045 if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) 1046 throw ex("parser.descape.1", this.offset-1); 1047 uv = uv*16+v1; 1048 this.next(); 1049 if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) 1050 throw ex("parser.descape.1", this.offset-1); 1051 uv = uv*16+v1; 1052 this.next(); 1053 if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) 1054 throw ex("parser.descape.1", this.offset-1); 1055 uv = uv*16+v1; 1056 this.next(); 1057 if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) 1058 throw ex("parser.descape.1", this.offset-1); 1059 uv = uv*16+v1; 1060 this.next(); 1061 if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) 1062 throw ex("parser.descape.1", this.offset-1); 1063 uv = uv*16+v1; 1064 if (uv > Token.UTF16_MAX) throw ex("parser.descappe.4", this.offset-1); 1065 c = uv; 1066 break; 1067 case 'A': 1068 case 'Z': 1069 case 'z': 1070 throw ex("parser.descape.5", this.offset-2); 1071 default: 1072 } 1073 return c; 1074 } 1075 1076 static private final int hexChar(int ch) { 1077 if (ch < '0') return -1; 1078 if (ch > 'f') return -1; 1079 if (ch <= '9') return ch-'0'; 1080 if (ch < 'A') return -1; 1081 if (ch <= 'F') return ch-'A'+10; 1082 if (ch < 'a') return -1; 1083 return ch-'a'+10; 1084 } 1085} 1086 | Popular Tags |