1 9 10 package com.opensymphony.module.sitemesh.parser; 11 12 import com.opensymphony.module.sitemesh.Page; 13 import com.opensymphony.module.sitemesh.PageParser; 14 import com.opensymphony.module.sitemesh.html.util.CharArray; 15 import com.opensymphony.module.sitemesh.util.CharArrayReader; 16 17 import java.io.IOException ; 18 import java.io.Reader ; 19 import java.util.Collections ; 20 import java.util.HashMap ; 21 import java.util.Map ; 22 23 31 public final class FastPageParser implements PageParser 32 { 33 private static final int TOKEN_NONE = -0; 34 private static final int TOKEN_EOF = -1; 35 private static final int TOKEN_TEXT = -2; 36 private static final int TOKEN_TAG = -3; 37 private static final int TOKEN_COMMENT = -4; 38 private static final int TOKEN_CDATA = -5; 39 private static final int TOKEN_SCRIPT = -6; 40 private static final int TOKEN_DOCTYPE = -7; 41 private static final int TOKEN_EMPTYTAG = -8; 42 43 private static final int STATE_EOF = -1; 44 private static final int STATE_TEXT = -2; 45 private static final int STATE_TAG = -3; 46 private static final int STATE_COMMENT = -4; 47 private static final int STATE_TAG_QUOTE = -5; 48 private static final int STATE_CDATA = -6; 49 private static final int STATE_SCRIPT = -7; 50 private static final int STATE_DOCTYPE = -8; 51 52 private static final int TAG_STATE_NONE = 0; 53 private static final int TAG_STATE_HTML = -1; 54 private static final int TAG_STATE_HEAD = -2; 55 private static final int TAG_STATE_TITLE = -3; 56 private static final int TAG_STATE_BODY = -4; 57 private static final int TAG_STATE_XML = -6; 58 private static final int TAG_STATE_XMP = -7; 59 60 67 private static final int SLASH_XML_HASH = 1518984; private static final int XML_HASH = 118807; private static final int SLASH_XMP_HASH = 1518988; private static final int XMP_HASH = 118811; private static final int HTML_HASH = 3213227; private static final int SLASH_HTML_HASH = 46618714; private static final int HEAD_HASH = 3198432; private static final int TITLE_HASH = 110371416; private static final int SLASH_TITLE_HASH = 1455941513; private static final int PARAMETER_HASH = 1954460585; private static final int META_HASH = 3347973; private static final int SLASH_HEAD_HASH = 46603919; private static final int FRAMESET_HASH = -1644953643; private static final int FRAME_HASH = 97692013; private static final int BODY_HASH = 3029410; private static final int SLASH_BODY_HASH = 46434897; private static final int CONTENT_HASH = 951530617; 85 public Page parse(char[] data) throws IOException 86 { 87 FastPage page = internalParse(new CharArrayReader(data)); 88 page.setVerbatimPage(data); 89 return page; 90 } 91 92 public Page parse(Reader reader) 93 { 94 return internalParse(reader); 95 } 96 97 private FastPage internalParse(Reader reader) 98 { 99 CharArray _buffer = new CharArray(4096); 100 CharArray _body = new CharArray(4096); 101 CharArray _head = new CharArray(512); 102 CharArray _title = new CharArray(128); 103 Map _htmlProperties = null; 104 Map _metaProperties = new HashMap (6); 105 Map _sitemeshProperties = new HashMap (6); 106 Map _bodyProperties = null; 107 108 CharArray _currentTaggedContent = new CharArray(1024); 109 String _contentTagId = null; 110 boolean tagged = false; 111 112 boolean _frameSet = false; 113 114 int _state = STATE_TEXT; 115 int _tokenType = TOKEN_NONE; 116 int _pushBack = 0; 117 int _comment = 0; 118 int _quote = 0; 119 boolean hide = false; 120 121 int state = TAG_STATE_NONE; 122 int laststate = TAG_STATE_NONE; 123 boolean doneTitle = false; 124 125 Tag tagObject = new Tag(); 127 128 while (_tokenType != TOKEN_EOF) 129 { 130 if(tagged) 131 { 132 if(_tokenType == TOKEN_TAG || _tokenType == TOKEN_EMPTYTAG) 133 { 134 if(_buffer==null || _buffer.length()==0) 135 { 136 _tokenType=TOKEN_NONE; 137 continue; 138 } 139 140 if (parseTag(tagObject, _buffer) == null) continue; 141 142 if (_buffer.compareLowerSubstr("/content")) { 144 tagged = false; 145 if(_contentTagId != null) 146 { 147 state = TAG_STATE_NONE; 148 _sitemeshProperties.put(_contentTagId, _currentTaggedContent.toString()); 149 _currentTaggedContent.setLength(0); 150 _contentTagId = null; 151 } 152 } 153 else 154 { 155 _currentTaggedContent.append('<').append(_buffer).append('>'); 156 } 157 } 158 else 159 { 160 if(_buffer.length() > 0) _currentTaggedContent.append(_buffer); 161 } 162 } 163 else 164 { 165 if(_tokenType == TOKEN_TAG || _tokenType == TOKEN_EMPTYTAG) 166 { 167 if(_buffer==null || _buffer.length()==0) 168 { 169 _tokenType=TOKEN_NONE; 170 continue; 171 } 172 173 if(parseTag(tagObject, _buffer) == null) { 174 _tokenType=TOKEN_TEXT; 175 continue; 176 } 177 178 int tagHash = _buffer.substrHashCode(); 179 180 if(state == TAG_STATE_XML || state == TAG_STATE_XMP) 181 { 182 writeTag(state, laststate, hide, _head, _buffer, _body); 183 if( (state == TAG_STATE_XML && tagHash == SLASH_XML_HASH) 184 ||(state == TAG_STATE_XMP && tagHash == SLASH_XMP_HASH) ) 185 { 186 state = laststate; 187 } 188 } 189 else 190 { 191 boolean doDefault = false; 192 switch (tagHash) { 193 case HTML_HASH: 194 if (!_buffer.compareLowerSubstr("html")) { doDefault = true; 196 break; 197 } 198 state = TAG_STATE_HTML; 199 _htmlProperties = parseProperties(tagObject, _buffer).properties; 200 break; 201 case HEAD_HASH: 202 if (!_buffer.compareLowerSubstr("head")) { doDefault = true; 204 break; 205 } 206 state = TAG_STATE_HEAD; 207 break; 208 case XML_HASH: 209 if (!_buffer.compareLowerSubstr("xml")) { doDefault = true; 211 break; 212 } 213 laststate = state; 214 writeTag(state, laststate, hide, _head, _buffer, _body); 215 state = TAG_STATE_XML; 216 break; 217 case XMP_HASH: 218 if (!_buffer.compareLowerSubstr("xmp")) { doDefault = true; 220 break; 221 } 222 laststate = state; 223 writeTag(state, laststate, hide, _head, _buffer, _body); 224 state = TAG_STATE_XMP; 225 break; 226 case TITLE_HASH: 227 if (!_buffer.compareLowerSubstr("title")) { doDefault = true; 229 break; 230 } 231 if (doneTitle) 232 { 233 hide = true; 234 } 235 else 236 { 237 laststate = state; 238 state = TAG_STATE_TITLE; 239 } 240 break; 241 case SLASH_TITLE_HASH: 242 if (!_buffer.compareLowerSubstr("/title")) { doDefault = true; 244 break; 245 } 246 if (doneTitle) 247 { 248 hide = false; 249 } 250 else 251 { 252 doneTitle = true; 253 state = laststate; 254 } 255 break; 256 case PARAMETER_HASH: 257 if (!_buffer.compareLowerSubstr("parameter")) { doDefault = true; 259 break; 260 } 261 parseProperties(tagObject, _buffer); 262 String name = (String ) tagObject.properties.get("name"); 263 String value = (String ) tagObject.properties.get("value"); 264 265 if (name != null && value != null) 266 { 267 _sitemeshProperties.put(name, value); 268 } 269 break; 270 case META_HASH: 271 if (!_buffer.compareLowerSubstr("meta")) { doDefault = true; 273 break; 274 } 275 CharArray metaDestination = state == TAG_STATE_HEAD ? _head : _body; 276 metaDestination.append('<'); 277 metaDestination.append(_buffer); 278 metaDestination.append('>'); 279 parseProperties(tagObject, _buffer); 280 name = (String ) tagObject.properties.get("name"); 281 value = (String ) tagObject.properties.get("content"); 282 283 if (name == null) 284 { 285 String httpEquiv = (String ) tagObject.properties.get("http-equiv"); 286 287 if (httpEquiv != null) 288 { 289 name = "http-equiv." + httpEquiv; 290 } 291 } 292 293 if (name != null && value != null) 294 { 295 _metaProperties.put(name, value); 296 } 297 break; 298 case SLASH_HEAD_HASH: 299 if (!_buffer.compareLowerSubstr("/head")) { doDefault = true; 301 break; 302 } 303 state = TAG_STATE_HTML; 304 break; 305 case FRAME_HASH: 306 if (!_buffer.compareLowerSubstr("frame")) { doDefault = true; 308 break; 309 } 310 _frameSet = true; 311 break; 312 case FRAMESET_HASH: 313 if (!_buffer.compareLowerSubstr("frameset")) { doDefault = true; 315 break; 316 } 317 _frameSet = true; 318 break; 319 case BODY_HASH: 320 if (!_buffer.compareLowerSubstr("body")) { doDefault = true; 322 break; 323 } 324 if (_tokenType == TOKEN_EMPTYTAG) 325 { 326 state = TAG_STATE_BODY; 327 } 328 _bodyProperties = parseProperties(tagObject, _buffer).properties; 329 break; 330 case CONTENT_HASH: 331 if (!_buffer.compareLowerSubstr("content")) { doDefault = true; 333 break; 334 } 335 state = TAG_STATE_NONE; 336 Map props = parseProperties(tagObject, _buffer).properties; 337 if (props != null) 338 { 339 tagged = true; 340 _contentTagId = (String ) props.get("tag"); 341 } 342 break; 343 case SLASH_XMP_HASH: 344 if (!_buffer.compareLowerSubstr("/xmp")) { doDefault = true; 346 break; 347 } 348 hide = false; 349 break; 350 case SLASH_BODY_HASH: 351 if (!_buffer.compareLowerSubstr("/body")) { doDefault = true; 353 break; 354 } 355 state = TAG_STATE_NONE; 356 hide = true; 357 break; 358 case SLASH_HTML_HASH: 359 if (!_buffer.compareLowerSubstr("/html")) { doDefault = true; 361 break; 362 } 363 state = TAG_STATE_NONE; 364 hide = true; 365 break; 366 default: 367 doDefault = true; 368 } 369 if (doDefault) 370 writeTag(state, laststate, hide, _head, _buffer, _body); 371 } 372 } 373 else if (!hide) 374 { 375 if (_tokenType == TOKEN_TEXT) 376 { 377 if (state == TAG_STATE_TITLE) 378 { 379 _title.append(_buffer); 380 } 381 else if (shouldWriteToHead(state, laststate)) 382 { 383 _head.append(_buffer); 384 } 385 else 386 { 387 _body.append(_buffer); 388 } 389 } 390 else if (_tokenType == TOKEN_COMMENT) 391 { 392 final CharArray commentDestination = shouldWriteToHead(state, laststate) ? _head : _body; 393 commentDestination.append("<!--"); 394 commentDestination.append(_buffer); 395 commentDestination.append("-->"); 396 } 397 else if (_tokenType == TOKEN_CDATA) 398 { 399 final CharArray commentDestination = state == TAG_STATE_HEAD ? _head : _body; 400 commentDestination.append("<![CDATA["); 401 commentDestination.append(_buffer); 402 commentDestination.append("]]>"); 403 } 404 else if (_tokenType == TOKEN_SCRIPT) 405 { 406 final CharArray commentDestination = state == TAG_STATE_HEAD ? _head : _body; 407 commentDestination.append('<'); 408 commentDestination.append(_buffer); 409 } 410 } 411 } 412 _buffer.setLength(0); 413 414 start: 415 while (true) 416 { 417 int c; 418 419 if(_pushBack != 0) 420 { 421 c = _pushBack; 422 _pushBack = 0; 423 } 424 else 425 { 426 try 427 { 428 c = reader.read(); 429 } 430 catch(IOException e) 431 { 432 _tokenType = TOKEN_EOF; 433 break start; 434 } 435 } 436 437 if(c < 0) 438 { 439 int tmpstate = _state; 440 _state = STATE_EOF; 441 442 if(_buffer.length() > 0 && tmpstate == STATE_TEXT) 443 { 444 _tokenType = TOKEN_TEXT; 445 break start; 446 } 447 else 448 { 449 _tokenType = TOKEN_EOF; 450 break start; 451 } 452 } 453 454 switch(_state) 455 { 456 case STATE_TAG: 457 { 458 int buflen = _buffer.length(); 459 460 if(c == '>') 461 { 462 if (_buffer.length() > 1 && _buffer.charAt(_buffer.length() - 1) == '/') 463 { 464 _tokenType = TOKEN_EMPTYTAG; 465 } 466 else 467 { 468 _tokenType = TOKEN_TAG; 469 } 470 _state = STATE_TEXT; 471 break start; 472 } 473 else if(c == '/') 474 { 475 _buffer.append('/'); 476 } 477 else if(c == '<' && buflen == 0) 478 { 479 _buffer.append("<<"); 480 _state = STATE_TEXT; 481 } 482 else if(c == '-' && buflen == 2 && _buffer.charAt(1) == '-' && _buffer.charAt(0) == '!') 483 { 484 _buffer.setLength(0); 485 _state = STATE_COMMENT; 486 } 487 else if(c == '[' && buflen == 7 && _buffer.charAt(0) == '!' && _buffer.charAt(1) == '[' && _buffer.compareLower("cdata", 2)) 488 { 489 _buffer.setLength(0); 490 _state = STATE_CDATA; 491 } 492 else if((c == 'e' || c == 'E') && buflen == 7 && _buffer.charAt(0) == '!' && _buffer.compareLower("doctyp", 1)) 493 { 494 _buffer.append((char)c); 495 _state = STATE_DOCTYPE; 496 } 497 else if((c == 'T' || c == 't') && buflen == 5 && _buffer.compareLower("scrip", 0)) 498 { 499 _buffer.append((char)c); 500 _state = STATE_SCRIPT; 501 } 502 503 else if(c == '"' || c == '\'') 504 { 505 _quote = c; 506 _buffer.append(( char ) c); 507 _state = STATE_TAG_QUOTE; 508 } 509 else 510 { 511 _buffer.append(( char ) c); 512 } 513 } 514 break; 515 516 case STATE_TEXT: 517 { 518 if(c == '<') 519 { 520 _state = STATE_TAG; 521 if(_buffer.length() > 0) 522 { 523 _tokenType = TOKEN_TEXT; 524 break start; 525 } 526 } 527 else 528 { 529 _buffer.append(( char ) c); 530 } 531 } 532 break; 533 534 case STATE_TAG_QUOTE: 535 { 536 if(c == '>') 537 { 538 _pushBack = c; 539 _state = STATE_TAG; 540 } 541 else 542 { 543 _buffer.append(( char ) c); 544 if(c == _quote) 545 { 546 _state = STATE_TAG; 547 } 548 } 549 } 550 break; 551 552 case STATE_COMMENT: 553 { 554 if(c == '>' && _comment >= 2) 555 { 556 _buffer.setLength(_buffer.length() - 2); 557 _comment = 0; 558 _state = STATE_TEXT; 559 _tokenType = TOKEN_COMMENT; 560 break start; 561 } 562 else if(c == '-') 563 { 564 _comment++; 565 } 566 else 567 { 568 _comment = 0; 569 } 570 571 _buffer.append(( char ) c); 572 } 573 break; 574 575 case STATE_CDATA: 576 { 577 if(c == '>' && _comment >= 2) 578 { 579 _buffer.setLength(_buffer.length() - 2); 580 _comment = 0; 581 _state = STATE_TEXT; 582 _tokenType = TOKEN_CDATA; 583 break start; 584 } 585 else if(c == ']') 586 { 587 _comment++; 588 } 589 else 590 { 591 _comment = 0; 592 } 593 594 _buffer.append(( char ) c); 595 } 596 break; 597 598 case STATE_SCRIPT: 599 { 600 _buffer.append((char) c); 601 if (c == '<') 602 { 603 _comment = 0; 604 } 605 else if ((c == '/' && _comment == 0) 606 ||((c == 's' || c == 'S' ) && _comment == 1) 607 ||((c == 'c' || c == 'C' ) && _comment == 2) 608 ||((c == 'r' || c == 'R' ) && _comment == 3) 609 ||((c == 'i' || c == 'I' ) && _comment == 4) 610 ||((c == 'p' || c == 'P' ) && _comment == 5) 611 ||((c == 't' || c == 'T' ) && _comment == 6) 612 ) 613 { 614 _comment++; 615 } 616 else if(c == '>' && _comment >= 7) 617 { 618 _comment = 0; 619 _state = STATE_TEXT; 620 _tokenType = TOKEN_SCRIPT; 621 break start; 622 } 623 } 624 break; 625 626 case STATE_DOCTYPE: 627 { 628 _buffer.append((char) c); 629 if (c == '>') 630 { 631 _state = STATE_TEXT; 632 _tokenType = TOKEN_DOCTYPE; 633 break start; 634 } 635 else { 636 _comment = 0; 637 } 638 } 639 break; 640 } 641 } 642 } 643 644 _currentTaggedContent = null; 646 _buffer = null; 647 648 return new FastPage(_sitemeshProperties, 649 _htmlProperties, 650 _metaProperties, 651 _bodyProperties, 652 _title.toString().trim(), 653 _head.toString().trim(), 654 _body.toString().trim(), 655 _frameSet); 656 } 657 658 private static void writeTag(int state, int laststate, boolean hide, CharArray _head, CharArray _buffer, CharArray _body) { 659 if (!hide) 660 { 661 if (shouldWriteToHead(state, laststate)) 662 { 663 _head.append('<').append(_buffer).append('>'); 664 } 665 else 666 { 667 _body.append('<').append(_buffer).append('>'); 668 } 669 } 670 } 671 672 private static boolean shouldWriteToHead(int state, int laststate) 673 { 674 return state == TAG_STATE_HEAD 675 ||(laststate == TAG_STATE_HEAD && (state == TAG_STATE_XML || state == TAG_STATE_XMP)); 676 } 677 678 689 private Tag parseTag(Tag tag, CharArray buf) 690 { 691 int len = buf.length(); 692 int idx = 0; 693 int begin; 694 695 while (idx < len && Character.isWhitespace(buf.charAt(idx))) idx++; 697 698 if(idx == len) return null; 699 700 begin = idx; 702 while (idx < len && !Character.isWhitespace(buf.charAt(idx))) idx++; 703 704 buf.setSubstr(begin, buf.charAt(idx - 1) == '/' ? idx - 1 : idx); 707 708 tag.nameEndIdx = idx; 710 711 return tag; 712 } 713 714 727 private static Tag parseProperties(Tag tag, CharArray buffer) 728 { 729 int len = buffer.length(); 730 int idx = tag.nameEndIdx; 731 732 tag.properties = Collections.EMPTY_MAP; 734 int begin; 735 while (idx < len) 736 { 737 while (idx < len && Character.isWhitespace(buffer.charAt(idx))) idx++; 739 740 if(idx == len) continue; 741 742 begin = idx; 743 if(buffer.charAt(idx) == '"') 744 { 745 idx++; 746 while (idx < len && buffer.charAt(idx) != '"') idx++; 747 if(idx == len) continue; 748 idx++; 749 } 750 else if(buffer.charAt(idx) == '\'') 751 { 752 idx++; 753 while (idx < len && buffer.charAt(idx) != '\'') idx++; 754 if(idx == len) continue; 755 idx++; 756 } 757 else 758 { 759 while (idx < len && !Character.isWhitespace(buffer.charAt(idx)) && buffer.charAt(idx) != '=') idx++; 760 } 761 762 buffer.setSubstr(begin, idx); 764 765 if(idx < len && Character.isWhitespace(buffer.charAt(idx))) 766 { 767 while (idx < len && Character.isWhitespace(buffer.charAt(idx))) idx++; 768 } 769 770 if(idx == len || buffer.charAt(idx) != '=') continue; 771 772 idx++; 773 774 if(idx == len) continue; 775 776 while(idx < len && (buffer.charAt(idx) == '\n' || buffer.charAt(idx) == '\r')) idx++; 777 778 if(buffer.charAt(idx) == ' ') 779 { 780 while (idx < len && Character.isWhitespace(buffer.charAt(idx))) idx++; 781 if(idx == len || (buffer.charAt(idx) != '"' && buffer.charAt(idx) != '"')) continue; 782 } 783 784 begin = idx; 785 int end; 786 if(buffer.charAt(idx) == '"') 787 { 788 idx++; 789 begin = idx; 790 while (idx < len && buffer.charAt(idx) != '"') idx++; 791 if(idx == len) continue; 792 end = idx; 793 idx++; 794 } 795 else if(buffer.charAt(idx) == '\'') 796 { 797 idx++; 798 begin = idx; 799 while (idx < len && buffer.charAt(idx) != '\'') idx++; 800 if(idx == len) continue; 801 end = idx; 802 idx++; 803 } 804 else 805 { 806 while (idx < len && !Character.isWhitespace(buffer.charAt(idx))) idx++; 807 end = idx; 808 } 809 String name = buffer.getLowerSubstr(); 811 String value = buffer.substring(begin, end); 812 813 tag.addProperty(name, value); 814 } 815 return tag; 816 } 817 818 private class Tag 819 { 820 public int nameEndIdx = 0; 823 824 public Map properties = Collections.EMPTY_MAP; 827 828 832 public void addProperty(String name, String value) 833 { 834 if(properties==Collections.EMPTY_MAP) 835 { 836 properties = new HashMap (8); 837 } 838 properties.put(name, value); 839 } 840 } 841 } 842 | Popular Tags |