1 21 22 package org.armedbear.j; 23 24 import java.io.BufferedReader ; 25 import java.io.IOException ; 26 import java.io.InputStream ; 27 import java.io.InputStreamReader ; 28 import java.io.PushbackReader ; 29 import java.io.Reader ; 30 import java.util.ArrayList ; 31 import java.util.Hashtable ; 32 import java.util.List ; 33 import java.util.Stack ; 34 35 public final class WebLoader implements WebConstants 36 { 37 private PushbackReader reader; 38 private final FastStringBuffer textBuffer = new FastStringBuffer(); 39 private final Stack indentStack = new Stack (); 40 private final Stack tableStack = new Stack (); 41 private Table currentTable; 42 private int sourceOffset; 43 private int offset; 44 private final int maxChars = 80; 45 private LineSegmentList segments; 46 private LineSequence lines; 47 private final Hashtable refs = new Hashtable (); 48 private int indentLevel; 49 private File file; 50 51 public WebLoader(File file) 52 { 53 this.file = file; 54 if (file.getEncoding() == null) 55 file.setEncoding("iso-8859-1"); 56 Debug.assertTrue(file.isLocal()); 57 } 58 59 public WebLoader(Reader reader) 60 { 61 this.reader = new PushbackReader (new BufferedReader (reader)); 62 } 63 64 public final Hashtable getRefs() 65 { 66 return refs; 67 } 68 69 public LineSequence load() 70 { 71 try { 72 loadInternal(); 73 } 74 catch (EncodingChangeException e) { 75 Log.debug("encoding change!"); 76 Log.debug("new encoding = |" + e.getNewEncoding() + "|"); 77 file.setEncoding(e.getNewEncoding()); 78 reader = null; 79 try { 80 loadInternal(); 81 } 82 catch (EncodingChangeException ex) { 83 Log.error(ex); 84 } 85 } 86 if (lines.getFirstLine() == null) 88 lines.appendLine(new WebLine(sourceOffset)); 89 return lines; 90 } 91 92 private void loadInternal() throws EncodingChangeException 93 { 94 if (reader == null) { 95 Debug.assertTrue(file != null); 96 String encoding = file.getEncoding(); 97 if (encoding == null) 98 encoding = Editor.preferences().getStringProperty(Property.DEFAULT_ENCODING); 99 try { 100 InputStream inputStream = file.getInputStream(); 101 reader = new PushbackReader (new BufferedReader (new InputStreamReader (inputStream, encoding))); 102 } 103 catch (IOException e) { 104 Log.error(e); 105 return; 106 } 107 } 108 lines = new LineSequence(); 109 sourceOffset = 0; 110 try { 111 int c; 112 while ((c = reader.read()) >= 0) { 113 if (c != '\r') 116 ++sourceOffset; 117 switch (c) { 118 case '<': 119 processMarkup(); 120 break; 121 case '&': 122 processEntity(); 123 break; 124 default: 125 doChar((char)c); 126 break; 127 } 128 } 129 flushLine(); 130 } 131 catch (IOException e) { 132 Log.error(e); 133 } 134 } 135 136 private boolean bold; 137 private boolean strong; 138 private boolean italic; 139 private boolean emphasis; 140 private boolean heading; 141 private boolean h1; 142 private boolean center; 143 private boolean preformatted; 144 private boolean whitespace; 145 private Link link; 146 147 private final boolean centered() 148 { 149 return center || h1; 150 } 151 152 private void processMarkup() throws EncodingChangeException 153 { 154 final String tag = gatherTag(); 155 if (tag.length() < 3) { 156 doText(tag); 157 return; 158 } 159 char c = tag.charAt(1); 160 if (c == '/') { 161 if (!Character.isLetter(tag.charAt(2))) { 162 doText(tag); 163 return; 164 } 165 } else { 166 if (c == '!') { 167 if (tag.equals("<!--")) 169 skipComment(); 170 return; 171 } 172 if (c == '?') { 173 return; 175 } 176 if (!Character.isLetter(c)) { 177 doText(tag); 178 return; 179 } 180 } 181 final String tagName = Utilities.getTagName(tag).toLowerCase().intern(); 182 183 if (tagName == "applet") { 185 skipTag("/applet"); 186 return; 187 } 188 if (tagName == "form") { 189 flushLine(); 190 textBuffer.append("[Form]"); 191 flushSegment(null, FORMAT_DISABLED); 192 flushLine(); 193 return; 194 } 195 if (tagName == "/form") { 196 flushLine(); 197 textBuffer.append("[End Form]"); 198 flushSegment(null, FORMAT_DISABLED); 199 newLine(); 200 return; 201 } 202 if (tagName == "input") { 203 List attributes = getAttributes(tag); 204 String type = getAttribute(attributes, "type"); 205 if (type != null) { 206 if (type.equalsIgnoreCase("submit")) { 207 flushSegment(); 208 String value = getAttribute(attributes, "value"); 209 if (value == null) 210 value = "Submit"; textBuffer.append('['); 212 textBuffer.append(value); 213 textBuffer.append(']'); 214 flushSegment(null, FORMAT_DISABLED); 215 } else if (type.equalsIgnoreCase("image")) { 216 flushSegment(); 217 textBuffer.append("[Image]"); 218 flushSegment(null, FORMAT_DISABLED); 219 } 220 } 221 return; 222 } 223 if (tagName == "object") { 224 skipTag("/object"); 225 return; 226 } 227 if (tagName == "xml") { 228 skipTag("/xml"); 229 return; 230 } 231 if (tagName == "script") { 232 skipScript(); 233 return; 234 } 235 236 if (tagName == "title") { 237 processTitle(); 238 return; 239 } 240 if (tagName == "b") { 241 flushSegment(); 242 if (bold) { 243 bold = false; 245 } else { 246 bold = true; 247 } 248 return; 249 } 250 if (tagName == "/b") { 251 flushSegment(); 252 bold = false; 253 return; 254 } 255 if (tagName == "strong") { 256 flushSegment(); 257 strong = true; 258 return; 259 } 260 if (tagName == "/strong") { 261 flushSegment(); 262 strong = false; 263 return; 264 } 265 if (tagName == "i") { 266 flushSegment(); 267 italic = true; 268 return; 269 } 270 if (tagName == "/i") { 271 flushSegment(); 272 italic = false; 273 return; 274 } 275 if (tagName == "em") { 276 flushSegment(); 277 emphasis = true; 278 return; 279 } 280 if (tagName == "/em") { 281 flushSegment(); 282 emphasis = false; 283 return; 284 } 285 if (tagName == "q" || tagName == "/q") { 286 maybeIndent(); 288 textBuffer.append('"'); 289 return; 290 } 291 if (tagName == "a") { 292 if (link != null) 293 processEndAnchor(); 295 else 296 processAnchor(tag); 297 return; 298 } 299 if (tagName == "/a") { 300 processEndAnchor(); 301 return; 302 } 303 if (tagName == "h1") { 304 newLine(); 305 heading = true; 306 h1 = true; 307 return; 308 } 309 if (tagName == "/h1") { 310 newLine(); 311 heading = false; 312 h1 = false; 313 return; 314 } 315 if (tagName == "h2" || 316 tagName == "h3" || 317 tagName == "h4" || 318 tagName == "h5" || 319 tagName == "h6") { 320 newLine(); 321 heading = true; 322 return; 323 } 324 if (tagName == "/h2" || 325 tagName == "/h3" || 326 tagName == "/h4" || 327 tagName == "/h5" || 328 tagName == "/h6") { 329 newLine(); 330 heading = false; 331 return; 332 } 333 if (tagName == "br") { 334 if (!flushLine()) { 337 lines.appendLine(new WebLine(sourceOffset)); 338 ++offset; 339 } 340 return; 341 } 342 if (tagName == "div") { 343 flushLine(); 344 return; 345 } 346 if (tagName == "/div") { 347 flushLine(); 348 return; 349 } 350 if (tagName == "p") { 351 newLine(); 352 return; 353 } 354 if (tagName == "pre") { 355 flushLine(); 356 preformatted = true; 357 return; 358 } 359 if (tagName == "/pre") { 360 newLine(); 361 preformatted = false; 362 return; 363 } 364 if (tagName == "blockquote") { 365 newLine(); 366 indentStack.push("blockquote"); 367 ++indentLevel; 368 return; 369 } 370 if (tagName == "/blockquote") { 371 newLine(); 372 if (!indentStack.empty()) { 373 String s = (String ) indentStack.pop(); 374 --indentLevel; 375 if (!s.equals("blockquote")) 376 Log.error("**** /blockquote: stack imbalance"); 377 } 378 return; 379 } 380 if (tagName == "dl") { 382 newLine(); 383 indentStack.push("dl"); 384 return; 385 } 386 if (tagName == "/dl") { 388 newLine(); 389 while (!indentStack.empty()) { 391 String s = (String ) indentStack.peek(); 392 if (s.equals("dd")) { 393 indentStack.pop(); 394 --indentLevel; 395 } else if (s.equals("dl")) { 396 indentStack.pop(); 397 break; 398 } else { 399 break; 401 } 402 } 403 return; 404 } 405 if (tagName == "dd") { 407 flushLine(); 408 if (!indentStack.empty()) { 409 String s = (String ) indentStack.peek(); 410 if (s.equals("dl")) 411 ; 412 else if (s.equals("dd")) { 413 return; 415 } else 416 Log.error("**** dd: top of stack is " + s); 417 } else 418 Log.error("**** dd: indentStack unexpectedly empty"); 419 indentStack.push("dd"); 420 ++indentLevel; 421 return; 422 } 423 if (tagName == "dt") { 425 flushLine(); 426 if (!indentStack.empty()) { 427 String s = (String ) indentStack.peek(); 428 if (s.equals("dd")) { 429 indentStack.pop(); --indentLevel; 431 } else if (s.equals("dl")) 432 ; 433 else 434 Log.error("**** dt: top of stack is " + s); 435 } else 436 Log.error("**** dt: indentStack unexpectedly empty"); 437 return; 438 } 439 if (tagName == "img") { 440 processImg(tag); 441 return; 442 } 443 if (tagName == "center") { 444 flushLine(); 445 center = true; 446 return; 447 } 448 if (tagName == "/center") { 449 flushLine(); 450 center = false; 451 return; 452 } 453 if (tagName == "hr") { 454 flushLine(); 455 link = null; 456 for (int i = 0; i < maxChars(); i++) 457 textBuffer.append('-'); 458 flushLine(); 459 return; 460 } 461 if (tagName == "ul") { 462 newLine(); 463 indentStack.push("ul"); 464 ++indentLevel; 465 } 466 if (tagName == "/ul") { 468 newLine(); 469 if (!indentStack.empty()) { 470 indentStack.pop(); 471 --indentLevel; 472 } 473 } 474 if (tagName == "li") { 476 flushLine(); 477 if (indentStack.size() > 0) { 478 textBuffer.append(Utilities.spaces(getIndent())); 479 } else { 480 textBuffer.append(Utilities.spaces(4)); 481 } 482 if (textBuffer.length() >= 2) 483 textBuffer.setCharAt(textBuffer.length() - 2, '\u2022'); 484 flushSegment(null, 0); 485 return; 486 } 487 if (tagName == "style") { 488 skipTag("/style"); 489 return; 490 } 491 if (tagName == "table") { 492 newLine(); 493 tableStack.push(currentTable); 494 currentTable = new Table(); 495 return; 496 } 497 if (tagName == "/table") { 498 flushLine(); 499 if (!tableStack.empty()) 500 currentTable = (Table) tableStack.pop(); 501 else 502 Log.error("**** /table: table stack imbalance source offset = " + sourceOffset); 503 return; 504 } 505 if (tagName == "tr") { 507 flushLine(); 508 if (currentTable != null) 509 currentTable.nextRow(); 510 else 511 Log.error("**** tr: currentTable is null source offset = " + sourceOffset); 512 return; 513 } 514 if (tagName == "td" || tagName == "th") { 516 flushSegment(); 517 if (currentTable != null) { 518 currentTable.nextColumn(); 519 int currentOffset = getCurrentOffset(); 520 int numSpaces = 1; 523 if (currentTable.getColumnIndex() == 0 || currentOffset == 0) 524 numSpaces = 0; 525 int minimumOffset = currentTable.getMinimumOffset(); 526 if (minimumOffset > 0) { 527 if (currentOffset < minimumOffset) 528 numSpaces = minimumOffset - currentOffset; 529 } 530 textBuffer.append(Utilities.spaces(numSpaces)); 531 flushSegment(null, FORMAT_WHITESPACE); 532 String s = getAttribute(tag, "width"); 533 if (s != null) { 534 if (s.endsWith("%")) { 535 s = s.substring(0, s.length()-1).trim(); 536 if (s.length() > 0) { 537 try { 538 int percent = Integer.parseInt(s); 539 int width = maxChars() * percent / 100; 540 currentTable.setColumnWidth(width); 541 } 542 catch (NumberFormatException e) { 543 Log.error(e); 544 } 545 } 546 } else 547 ; } 549 } else 550 Log.error("**** td: currentTable is null"); 551 return; 552 } 553 if (tagName == "meta") { 554 if (file == null) 558 return; 559 String encoding = file.getEncoding(); 560 if (encoding != null) { 563 if (encoding.equals("UnicodeBig") || encoding.equals("UnicodeLittle")) 564 return; 565 } 566 List attributes = getAttributes(tag); 567 String httpEquiv = getAttribute(attributes, "http-equiv"); 568 if (httpEquiv != null) { 569 if (httpEquiv.toLowerCase().equals("content-type")) { 570 String contentType = getAttribute(attributes, "content"); 571 if (contentType != null) { 572 String charset = 573 Utilities.getCharsetFromContentType(contentType); 574 Log.debug("charset = |" + charset + "|"); 575 if (charset != null && charset.length() > 0) { 576 String newEncoding = 577 Utilities.getEncodingFromCharset(charset); 578 Log.debug("new encoding = " + newEncoding); 579 if (!newEncoding.equalsIgnoreCase(encoding)) 580 throw new EncodingChangeException(newEncoding); 581 Log.debug("no encoding change"); 582 } 583 } 584 } 585 } 586 return; 587 } 588 } 589 590 private void processTitle() 591 { 592 FastStringBuffer sb = new FastStringBuffer(); 593 try { 594 int c; 595 while ((c = reader.read()) >= 0) { 596 if (c != '\r') 597 ++sourceOffset; 598 if (c == '<') { 599 String tag = gatherTag(); 600 if (!isTag(tag, "/title")) 601 Log.error("processTitle unexpected tag " + tag); 602 break; 603 } else if (c == '&') { 604 String entity = gatherEntity(); 605 sb.append(substituteEntity(entity)); 606 } else 607 sb.append((char)c); 608 } 609 } 610 catch (IOException e) { 611 Log.error(e); 612 } 613 String title = sb.toString().trim(); 614 if (lines.getFirstLine() == null) { 615 if (textBuffer.length() == 0) { 616 if (title.length() < maxChars()) 617 textBuffer.append(Utilities.spaces(maxChars() - title.length())); 618 textBuffer.append(title); 619 flushLine(); 620 } 621 } 622 } 623 624 private void processAnchor(String tag) 625 { 626 flushSegment(); 627 List attributes = getAttributes(tag); 628 if (attributes != null) { 629 for (int i = 0; i < attributes.size(); i++) { 630 StringPair pair = (StringPair) attributes.get(i); 631 if (pair.first.equals("href")) 632 link = new Link(pair.second.trim()); 633 else if (pair.first.equals("name")) 634 addRef(pair.second, offset); 635 } 636 } 637 } 638 639 private void processEndAnchor() 640 { 641 boolean appendSpace = false; 642 while (textBuffer.toString().endsWith(" ")) { 643 appendSpace = true; 644 textBuffer.setLength(textBuffer.length() - 1); 645 } 646 flushSegment(); 647 link = null; 648 if (appendSpace) { 649 textBuffer.append(' '); 650 flushSegment(); 651 } 652 } 653 654 private void processImg(String tag) 655 { 656 flushSegment(); 657 List attributes = getAttributes(tag); 658 String alt = getAttribute(attributes, "alt"); 659 String src = getAttribute(attributes, "src"); 660 String width = getAttribute(attributes, "width"); 661 String height = getAttribute(attributes, "height"); 662 int w = 0; 663 int h = 0; 664 if (width != null) { 665 try { 666 w = Integer.parseInt(width); 667 } 668 catch (NumberFormatException e) {} 669 } 670 if (height != null) { 671 try { 672 h = Integer.parseInt(height); 673 } 674 catch (NumberFormatException e) {} 675 } 676 ImageLink imageLink = null; 678 if (src != null && src.length() > 0) { 679 String lower = src.toLowerCase(); 680 if (lower.endsWith(".jpg") || lower.endsWith(".gif") || lower.endsWith(".png")) { 681 if (w >= 100 && h >= 100) 683 imageLink = new ImageLink(src); 684 } 685 } 686 if (imageLink != null) { 687 FastStringBuffer sb = new FastStringBuffer("[IMAGE"); 688 if (width != null && height != null) { 689 sb.append(' '); 690 sb.append(width); 691 sb.append('x'); 692 sb.append(height); 693 } 694 sb.append(']'); 695 if (alt != null && (alt = alt.trim()).length() > 0) { 696 sb.append(' '); 697 sb.append(alt); 698 } 699 imageLink.setText(sb.toString()); 700 textBuffer.append(imageLink.getText()); 701 flushSegment(imageLink, FORMAT_LINK); 702 } 703 if (segments == null || segments.size() == 0) { 706 return; 708 } 709 FastStringBuffer sb = new FastStringBuffer(); 710 for (int i = 0; i < segments.size(); i++) { 711 HtmlLineSegment segment = (HtmlLineSegment) segments.getSegment(i); 712 sb.append(segment.getText()); 713 } 714 if (sb.length() == 0 || sb.charAt(sb.length()-1) == ' ') 715 return; 716 textBuffer.append(' '); 718 flushSegment(null, FORMAT_WHITESPACE); 719 } 720 721 private final void addRef(String ref, int offset) 722 { 723 refs.put(ref, new Integer (offset)); 724 } 725 726 private static final String getAttribute(String tag, String attributeName) 727 { 728 return getAttribute(getAttributes(tag), attributeName); 729 } 730 731 private static String getAttribute(List attributes, String attributeName) 732 { 733 if (attributes != null) { 734 for (int i = attributes.size()-1; i >= 0; i--) { 735 StringPair pair = (StringPair) attributes.get(i); 736 if (pair.first.equals(attributeName)) 737 return pair.second; 738 } 739 } 740 return null; 741 } 742 743 private static List getAttributes(String tag) 744 { 745 final int NEUTRAL = 0; 746 final int ATTRIBUTE_NAME = 1; 747 final int SPACE_BEFORE_EQ = 2; 748 final int SPACE_AFTER_EQ = 3; 749 final int ATTRIBUTE_VALUE = 4; 750 751 int state = NEUTRAL; 752 FastStringBuffer sb = new FastStringBuffer(); 753 String name = null; 754 String value = null; 755 ArrayList attributes = null; 756 char delim = 0; 757 758 final int limit = tag.length(); 759 int i; 760 for (i = 0; i < limit; i++) { 762 char c = tag.charAt(i); 763 if (c == '>') 764 return null; 765 if (Character.isWhitespace(c)) { 766 ++i; 767 break; 768 } 769 } 770 771 for (; i < limit; i++) { 772 char c = tag.charAt(i); 773 switch (state) { 774 case NEUTRAL: 775 if (Character.isWhitespace(c)) 776 ; 777 else { 778 sb.setLength(0); 779 sb.append(c); 780 state = ATTRIBUTE_NAME; 781 } 782 break; 783 case ATTRIBUTE_NAME: 784 if (c == '=') { 785 name = sb.toString().toLowerCase(); 786 sb.setLength(0); 787 state = SPACE_AFTER_EQ; 788 } else if (Character.isWhitespace(c)) { 789 name = sb.toString().toLowerCase(); 790 sb.setLength(0); 791 state = SPACE_BEFORE_EQ; 792 } else 793 sb.append(c); 794 break; 795 case SPACE_BEFORE_EQ: 796 if (Character.isWhitespace(c)) 797 ; 798 else if (c == '=') 799 state = SPACE_AFTER_EQ; 800 else { 801 sb.setLength(0); 803 state = NEUTRAL; 804 if (attributes == null) 805 attributes = new ArrayList (); 806 attributes.add(new StringPair(name, "")); 807 name = value = null; 808 } 809 break; 810 case SPACE_AFTER_EQ: 811 if (Character.isWhitespace(c)) 812 ; 813 else if ( c == '"' || c == '\'') { 814 delim = c; 815 sb.setLength(0); 816 state = ATTRIBUTE_VALUE; 817 } else { 818 delim = 0; 819 sb.setLength(0); 820 sb.append(c); 821 state = ATTRIBUTE_VALUE; 822 } 823 break; 824 case ATTRIBUTE_VALUE: 825 if (delim != 0) { 826 if (c == delim) { 827 value = sb.toString(); 828 sb.setLength(0); 829 state = NEUTRAL; 830 if (attributes == null) 831 attributes = new ArrayList (); 832 attributes.add(new StringPair(name, value)); 833 name = value = null; 834 } else if (c == '&') { 835 FastStringBuffer sbEntity = new FastStringBuffer(); 836 sbEntity.append('&'); 837 for (++i; i < limit; i++) { 838 c = tag.charAt(i); 839 if (c == delim) { 840 sb.append(sbEntity.toString()); 842 --i; 844 break; 845 } 846 sbEntity.append(c); 847 if (c == ';') { 848 sb.append(substituteEntity(sbEntity.toString())); 849 break; 850 } 851 } 852 } else 853 sb.append(c); 854 } else { 855 if (c == '>' || Character.isWhitespace(c)) { 857 value = sb.toString(); 858 sb.setLength(0); 859 state = NEUTRAL; 860 if (attributes == null) 861 attributes = new ArrayList (); 862 attributes.add(new StringPair(name, value)); 863 name = value = null; 864 } else if (c == '&') { 865 FastStringBuffer sbEntity = new FastStringBuffer(); 866 sbEntity.append('&'); 867 for (++i; i < limit; i++) { 868 c = tag.charAt(i); 869 if (c == ' ' || c == '>') { 870 --i; 872 break; 874 } 875 sbEntity.append(c); 876 if (c == ';') 877 break; 878 } 879 sb.append(substituteEntity(sbEntity.toString())); 880 } else 881 sb.append(c); 882 } 883 break; 884 } 885 } 886 887 return attributes; 888 } 889 890 private static boolean isTag(String s, String tagName) 892 { 893 Debug.assertTrue(tagName.indexOf('<') < 0); 894 Debug.assertTrue(tagName.indexOf('>') < 0); 895 Debug.assertTrue(tagName.indexOf(' ') < 0); 896 897 if (s == null || s.length() < 3) 899 return false; 900 if (s.charAt(0) != '<') 901 return false; 902 int length = tagName.length(); 903 if (s.length() < length + 2) 904 return false; 905 if (!s.regionMatches(true, 1, tagName, 0, length)) 906 return false; 907 char c = s.charAt(length + 1); 909 return c == '>' || Character.isWhitespace(c); 910 } 911 912 private String gatherTag() 913 { 914 final int TAG_NAME = 0; 915 final int NEUTRAL = 1; 916 final int ATTRIBUTE_NAME = 2; 917 final int SPACE_BEFORE_EQ = 3; 918 final int SPACE_AFTER_EQ = 4; 919 final int ATTRIBUTE_VALUE = 5; 920 final int MARKED_SECTION = 6; 921 final int BANG = 7; 922 final int INVALID = 8; 923 924 FastStringBuffer sb = new FastStringBuffer(256); 925 sb.append('<'); 926 int length = 1; 927 int state = TAG_NAME; 928 char delim = 0; 929 930 int ch; 931 932 try { 933 while ((ch = reader.read()) >= 0) { 934 char c = (char) ch; 935 if (c == '<') { 936 if (state != ATTRIBUTE_VALUE || delim == 0) { 939 Log.error("unexpected '<' sourceOffset = " + sourceOffset); 940 reader.unread(c); 941 return sb.toString(); 942 } 943 } 944 if (c != '\r') 945 ++sourceOffset; 946 if (c <= ' ') { 948 if (length == 1) 949 continue; 950 if (length == 2 && sb.charAt(1) == '/') 951 continue; 952 } 953 sb.append(c); 954 ++length; 955 switch (state) { 956 case TAG_NAME: 957 if (c == '>') { 958 return sb.toString(); 960 } else if (Character.isWhitespace(c)) { 961 state = NEUTRAL; 963 } else if (length == 2 && c == '!') { 964 state = BANG; 965 } else if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_' || c == ':') { 966 ; } else if (length == 2 && (c == '/' || c == '!')) { 968 ; } else if (length > 2 && ((c >= '0' && c <= '9') || c == '-' || c == '.')) { 970 ; } else { 972 Log.error("invalid tag sourceOffset = " + sourceOffset); 974 state = INVALID; 975 } 976 break; 977 case BANG: 978 if (c == '>') { 979 return sb.toString(); 980 } else if (length == 4 && sb.toString().equals("<!--")) { 981 return sb.toString(); 983 } else if (length == 3 && sb.toString().equals("<![")) { 984 state = MARKED_SECTION; 985 } 986 break; 987 case NEUTRAL: 988 if (c == '>') 989 return sb.toString(); 990 else if (!Character.isWhitespace(c)) 991 state = ATTRIBUTE_NAME; 992 break; 993 case ATTRIBUTE_NAME: 994 if (c == '>') 995 return sb.toString(); 996 else if (c == '=') 997 state = SPACE_AFTER_EQ; 998 else if (Character.isWhitespace(c)) 999 state = SPACE_BEFORE_EQ; 1000 break; 1001 case SPACE_BEFORE_EQ: 1002 if (c == '>') 1003 return sb.toString(); 1004 else if (Character.isWhitespace(c)) 1005 ; 1006 else if (c == '=') 1007 state = SPACE_AFTER_EQ; 1008 else { 1009 state = NEUTRAL; 1011 } 1012 break; 1013 case SPACE_AFTER_EQ: 1014 if (c == '>') 1015 return sb.toString(); 1016 else if (Character.isWhitespace(c)) 1017 ; 1018 else if ( c == '"' || c == '\'') { 1019 delim = c; 1020 state = ATTRIBUTE_VALUE; 1021 } else { 1022 delim = 0; 1023 state = ATTRIBUTE_VALUE; 1024 } 1025 break; 1026 case ATTRIBUTE_VALUE: 1027 if (delim != 0) { 1028 if (c == delim) 1029 state = NEUTRAL; 1030 } else { 1031 if (c == '>') 1033 return sb.toString(); 1034 else if (Character.isWhitespace(c)) 1035 state = NEUTRAL; 1036 } 1037 break; 1038 case MARKED_SECTION: 1039 if (c == '>') { 1040 if (sb.toString().endsWith("]>")) 1041 return sb.toString(); 1042 } 1043 break; 1044 case INVALID: 1045 if (c == '>') { 1046 Log.error("invalid tag |" + sb.toString() + 1047 "| sourceOffset = " + sourceOffset); 1048 return sb.toString(); 1049 } 1050 break; 1051 } 1052 } 1053 } 1054 catch (IOException e) { 1055 Log.error(e); 1056 } 1057 1058 return sb.toString(); 1059 } 1060 1061 private void processEntity() 1062 { 1063 String entity = gatherEntity(); 1064 doText(substituteEntity(entity)); 1065 } 1066 1067 private String gatherEntity() 1068 { 1069 FastStringBuffer sb = new FastStringBuffer('&'); 1070 try { 1071 int c; 1072 while ((c = reader.read()) >= 0) { 1073 if (c == '<' || c == '&') { 1074 reader.unread(c); 1075 break; 1076 } 1077 if (c != '\r') 1078 ++sourceOffset; 1079 sb.append((char) c); 1080 if (c == ';') 1081 break; 1082 if (c == ' ') 1083 break; 1084 } 1085 } 1086 catch (IOException e) { 1087 Log.error(e); 1088 } 1089 return sb.toString(); 1090 } 1091 1092 private static String substituteEntity(String entity) 1093 { 1094 final int length = entity.length(); 1095 if (length < 2) 1096 return entity; 1097 if (entity.equals("& ")) 1098 return entity; if (entity.charAt(1) == '#') { 1100 String s; 1102 if (entity.charAt(length - 1) == ';') 1103 s = entity.substring(2, length - 1); 1104 else 1105 s = entity.substring(2); 1106 1107 int n = -1; 1108 try { 1109 n = Integer.parseInt(s); 1110 } 1111 catch (NumberFormatException e) {} 1112 1113 if (n >= 0) { 1114 switch (n) { 1115 case 145: case 146: return "'"; 1118 case 147: case 148: return "\""; 1121 case 149: return String.valueOf((char)8226); 1123 case 150: return "-"; 1125 case 151: return "--"; 1127 case 153: 1128 return "(TM)"; 1129 case 174: 1130 return "(R)"; 1131 default: 1132 return String.valueOf((char)n); 1133 } 1134 } 1135 } 1136 1137 String s; 1139 if (entity.charAt(length - 1) == ';') 1140 s = entity.substring(1, length-1).intern(); 1141 else 1142 s = entity.substring(1).intern(); 1143 1144 if (s == "quot") 1145 return "\""; 1146 else if (s == "trade") return "(TM)"; 1148 else if (s == "nbsp") 1149 return String.valueOf((char)160); 1150 else if (s == "copy") 1151 return String.valueOf((char)169); 1152 else if (s == "laquo") 1153 return String.valueOf((char)171); 1154 else if (s == "reg") return "(R)"; 1156 else if (s == "acute") 1157 return String.valueOf((char)180); 1158 else if (s == "auml") 1159 return String.valueOf((char)228); 1160 else if (s == "middot") 1161 return String.valueOf((char)183); 1162 else if (s == "raquo") 1163 return String.valueOf((char)187); 1164 else if (s == "eacute") 1165 return String.valueOf((char)233); 1166 else if (s == "iuml") 1167 return String.valueOf((char)239); 1168 else if (s == "bull") 1169 return String.valueOf((char)8226); 1170 else if (s == "AElig") 1171 return "AE"; 1172 else if (s == "amp") 1173 return "&"; 1174 else if (s == "lt") 1175 return "<"; 1176 else if (s == "gt") 1177 return ">"; 1178 else 1179 return entity; 1180 } 1181 1182 private void skipComment() 1183 { 1184 FastStringBuffer sb = new FastStringBuffer(); 1185 try { 1186 int c; 1187 while ((c = reader.read()) >= 0) { 1188 if (c != '\r') 1189 ++sourceOffset; 1190 sb.append((char) c); 1191 if (c == '>' && sb.toString().endsWith("-->")) 1192 return; 1193 } 1194 } 1195 catch (IOException e){ 1196 Log.error(e); 1197 } 1198 } 1199 1200 private void skipTag(String tagName) 1201 { 1202 try { 1203 int c; 1204 while ((c = reader.read()) >= 0) { 1205 if (c != '\r') 1206 ++sourceOffset; 1207 if (c == '<') { 1208 String tag = gatherTag(); 1209 if (isTag(tag, tagName)) 1210 return; 1211 } 1212 } 1213 } 1214 catch (IOException e) { 1215 Log.error(e); 1216 } 1217 } 1218 1219 private void skipScript() 1220 { 1221 try { 1222 int c; 1223 while ((c = reader.read()) >= 0) { 1224 if (c != '\r') 1225 ++sourceOffset; 1226 if (c == '<') { 1227 if (readEndScriptTag()) 1228 return; 1229 } 1230 } 1231 } 1232 catch (IOException e) { 1233 Log.error(e); 1234 } 1235 } 1236 1237 private boolean readEndScriptTag() 1238 { 1239 final String s = "</script>"; 1240 final int length = s.length(); 1241 FastStringBuffer sb = new FastStringBuffer('<'); 1242 try { 1243 int c; 1244 while ((c = reader.read()) >= 0) { 1245 if (c != '\r') 1246 ++sourceOffset; 1247 sb.append(Character.toLowerCase((char)c)); 1248 if (sb.length() < length) { 1249 if (!s.startsWith(sb.toString())) 1250 return false; 1251 } else 1252 return s.equals(sb.toString()); 1253 } 1254 } 1255 catch (IOException e) { 1256 Log.error(e); 1257 } 1258 return false; 1259 } 1260 1261 private void doText(String s) 1262 { 1263 final int length = s.length(); 1264 for (int i = 0; i < length; i++) 1265 doChar(s.charAt(i)); 1266 } 1267 1268 private void doChar(char c) 1269 { 1270 if (preformatted) { 1271 switch (c) { 1272 case '\t': 1273 final int spaces = 8 - getCurrentOffset() % 8; 1274 for (int i = spaces-1; i >= 0; i--) 1275 textBuffer.append(' '); 1276 break; 1277 case '\r': 1278 break; 1279 case '\n': 1280 flushSegment(); 1281 if (segments != null) { 1282 lines.appendLine(new WebLine(segments, sourceOffset)); 1283 segments = null; 1284 } else 1285 lines.appendLine(new WebLine(sourceOffset)); 1286 ++offset; break; 1288 default: 1289 textBuffer.append(c); 1290 break; 1291 } 1292 return; 1293 } 1294 1295 switch (c) { 1296 case 133: textBuffer.append("..."); 1298 break; 1299 case 145: case 146: textBuffer.append('\''); 1302 break; 1303 case 147: case 148: textBuffer.append('"'); 1306 break; 1307 case 149: textBuffer.append((char)8226); 1309 break; 1310 case 150: 1311 textBuffer.append('-'); 1313 break; 1314 case 151: 1315 textBuffer.append("--"); 1317 break; 1318 case 153: 1319 textBuffer.append("(TM)"); 1320 break; 1321 case '\n': 1322 case '\t': 1323 case ' ': 1324 if (textBuffer.length() > 0) { 1327 char preceding = textBuffer.charAt(textBuffer.length() - 1); 1328 if (preceding != ' ' && preceding != 160) 1329 textBuffer.append(' '); 1330 } else if (segments != null && segments.size() > 0) { 1331 HtmlLineSegment seg = (HtmlLineSegment) segments.getLastSegment(); 1333 String s = seg.getText(); 1334 if (s.length() == 0) 1335 textBuffer.append(' '); 1336 else { 1337 char preceding = s.charAt(s.length() - 1); 1338 if (preceding != ' ' && preceding != 160) 1339 textBuffer.append(' '); 1340 } 1341 } 1342 break; 1343 case '\r': 1344 break; 1345 default: 1346 maybeIndent(); 1349 textBuffer.append(c); 1350 break; 1351 } 1352 1353 if (Character.isWhitespace(c)) 1354 maybeWrap(); 1355 } 1356 1357 private void maybeIndent() 1358 { 1359 if (indentLevel > 0) { 1360 if (segments == null && textBuffer.length() == 0) { 1361 textBuffer.append(Utilities.spaces(getIndent())); 1362 flushSegment(null, FORMAT_WHITESPACE); 1363 } 1364 } 1365 } 1366 1367 private final int getIndent() 1368 { 1369 return indentLevel * 4; 1370 } 1371 1372 private int getCurrentOffset() 1373 { 1374 int currentOffset = 0; 1375 if (segments != null) { 1376 for (int i = segments.size()-1; i >= 0; i--) 1377 currentOffset += segments.getSegment(i).length(); 1378 } 1379 currentOffset += textBuffer.length(); 1380 return currentOffset; 1381 } 1382 1383 private final void flushSegment() 1384 { 1385 flushSegment(true); 1386 } 1387 1388 private void flushSegment(boolean wrap) 1389 { 1390 if (textBuffer.length() > 0) { 1391 if (wrap) 1392 maybeWrap(); 1393 int format = 0; 1394 if (link != null) 1395 format |= FORMAT_LINK; 1396 if (bold || strong || heading) 1397 format |= FORMAT_BOLD; 1398 if (italic || emphasis) 1399 format |= FORMAT_ITALIC; 1400 if (whitespace) 1401 format |= FORMAT_WHITESPACE; 1402 if (segments == null) 1403 segments = new LineSegmentList(); 1404 segments.addSegment(new HtmlLineSegment(textBuffer.toString(), format, link)); 1405 offset += textBuffer.length(); 1406 textBuffer.setLength(0); 1407 } 1408 } 1409 1410 private void flushSegment(Link link, int format) 1411 { 1412 if (textBuffer.length() > 0) { 1413 if (segments == null) 1414 segments = new LineSegmentList(); 1415 segments.addSegment(new HtmlLineSegment(textBuffer.toString(), format, link)); 1416 offset += textBuffer.length(); 1417 textBuffer.setLength(0); 1418 } 1419 } 1420 1421 private void maybeWrap() 1422 { 1423 if (preformatted) 1424 return; 1425 int currentOffset = getCurrentOffset(); 1426 if (currentOffset > maxChars()) { 1427 int length = textBuffer.length(); 1428 1429 int preceding = currentOffset - length; 1431 1432 final String text = textBuffer.toString(); 1433 int index = text.lastIndexOf(' '); 1434 while (index >= 0 && preceding + index > maxChars()) 1435 index = text.lastIndexOf(' ', index - 1); 1436 1437 if (index >= 0) { 1438 String remainder = text.substring(index + 1); 1440 textBuffer.setLength(index); flushSegment(false); if (segments != null) { 1443 lines.appendLine(new WebLine(segments, sourceOffset)); 1444 ++offset; segments = null; 1446 } 1447 maybeIndent(); 1448 textBuffer.append(remainder); 1449 } else { 1450 textBuffer.setLength(0); 1452 if (segments != null) { 1453 final int last = segments.size() - 1; 1454 if (last >= 0) { 1455 final HtmlLineSegment lastSegment = (HtmlLineSegment) segments.getSegment(last); 1456 final String segmentText = lastSegment.getText(); 1457 index = segmentText.lastIndexOf(' '); 1458 if (index >= 0) { 1459 final String head = segmentText.substring(0, index); 1461 final String tail = segmentText.substring(index + 1); 1462 1463 --offset; 1466 1467 final int format = lastSegment.getFormat(); 1468 final Link link = lastSegment.getLink(); 1469 1470 segments.setSegment(last, new HtmlLineSegment(head, format, link)); 1471 lines.appendLine(new WebLine(segments, sourceOffset)); 1472 1473 ++offset; 1475 1476 segments = null; 1477 if (tail.length() > 0) { 1478 maybeIndent(); 1479 if (segments == null) 1480 segments = new LineSegmentList(); 1481 segments.addSegment(new HtmlLineSegment(tail, format, link)); 1482 } 1483 } else { 1484 segments.removeSegment(lastSegment); 1486 lines.appendLine(new WebLine(segments, sourceOffset)); 1487 1488 ++offset; 1490 1491 segments = null; 1492 maybeIndent(); 1493 if (segments == null) 1494 segments = new LineSegmentList(); 1495 segments.addSegment(lastSegment); 1496 } 1497 } 1498 } 1499 1500 maybeIndent(); 1501 textBuffer.append(text); 1502 flushSegment(false); } 1504 } 1505 } 1506 1507 private boolean flushLine() 1509 { 1510 flushSegment(); 1511 if (centered() && currentTable == null && segments != null) { 1512 int length = getCurrentOffset(); 1513 if (maxChars() > length) { 1514 int numSpaces = (maxChars() - length) / 2; 1515 if (numSpaces > 0) { 1516 segments.addSegment(0, new HtmlLineSegment(Utilities.spaces(numSpaces), 1517 FORMAT_WHITESPACE, null)); 1518 offset += numSpaces; 1519 } 1520 } 1521 } 1522 if (segments != null) { 1523 lines.appendLine(new WebLine(segments, sourceOffset)); 1524 ++offset; segments = null; 1526 return true; 1527 } else 1528 return false; 1529 } 1530 1531 private void newLine() 1532 { 1533 flushLine(); 1534 Line lastLine = lines.getLastLine(); 1535 if (lastLine != null && lastLine.length() > 0 && !lastLine.isBlank()) { 1536 lines.appendLine(new WebLine(sourceOffset)); 1537 ++offset; 1538 } 1539 } 1540 1541 private final int maxChars() 1542 { 1543 Debug.assertTrue(maxChars == 80); 1558 return maxChars; 1559 } 1560 1561 private static class EncodingChangeException extends Exception 1562 { 1563 private String newEncoding; 1564 1565 EncodingChangeException(String newEncoding) 1566 { 1567 this.newEncoding = newEncoding; 1568 } 1569 1570 String getNewEncoding() 1571 { 1572 return newEncoding; 1573 } 1574 } 1575} 1576 | Popular Tags |