1 7 8 package javax.swing.text.html.parser; 9 10 import javax.swing.text.SimpleAttributeSet ; 11 import javax.swing.text.html.HTML ; 12 import javax.swing.text.ChangedCharSetException ; 13 import java.io.*; 14 import java.util.Hashtable ; 15 import java.util.Properties ; 16 import java.util.Vector ; 17 import java.util.Enumeration ; 18 import java.net.URL ; 19 20 import sun.misc.MessageUtils; 21 22 63 public 64 class Parser implements DTDConstants { 65 66 private char text[] = new char[1024]; 67 private int textpos = 0; 68 private TagElement last; 69 private boolean space; 70 71 private char str[] = new char[128]; 72 private int strpos = 0; 73 74 protected DTD dtd = null; 75 76 private int ch; 77 private int ln; 78 private Reader in; 79 80 private Element recent; 81 private TagStack stack; 82 private boolean skipTag = false; 83 private TagElement lastFormSent = null; 84 private SimpleAttributeSet attributes = new SimpleAttributeSet (); 85 86 private boolean seenHtml = false; 92 private boolean seenHead = false; 93 private boolean seenBody = false; 94 95 115 private boolean ignoreSpace; 116 117 124 protected boolean strict = false; 125 126 127 128 private int crlfCount; 129 130 private int crCount; 131 132 private int lfCount; 133 134 143 private int currentBlockStartPos; 144 145 private int lastBlockStartPos; 146 147 151 private static final char[] cp1252Map = { 152 8218, 402, 8222, 8230, 8224, 8225, 710, 8240, 352, 8249, 338, 141, 142, 143, 144, 8216, 8217, 8220, 8221, 8226, 8211, 8212, 732, 8482, 353, 8250, 339, 157, 158, 376 }; 183 184 public Parser(DTD dtd) { 185 this.dtd = dtd; 186 } 187 188 189 192 protected int getCurrentLine() { 193 return ln; 194 } 195 196 203 int getBlockStartPosition() { 204 return Math.max(0, lastBlockStartPos - 1); 205 } 206 207 210 protected TagElement makeTag(Element elem, boolean fictional) { 211 return new TagElement (elem, fictional); 212 } 213 214 protected TagElement makeTag(Element elem) { 215 return makeTag(elem, false); 216 } 217 218 protected SimpleAttributeSet getAttributes() { 219 return attributes; 220 } 221 222 protected void flushAttributes() { 223 attributes.removeAttributes(attributes); 224 } 225 226 229 protected void handleText(char text[]) { 230 } 231 232 235 protected void handleTitle(char text[]) { 236 handleText(text); 239 } 240 241 244 protected void handleComment(char text[]) { 245 } 246 247 protected void handleEOFInComment() { 248 254 int commentEndPos = strIndexOf('\n'); 255 if (commentEndPos >= 0) { 256 handleComment(getChars(0, commentEndPos)); 257 try { 258 in.close(); 259 in = new CharArrayReader(getChars(commentEndPos + 1)); 260 ch = '>'; 261 } catch (IOException e) { 262 error("ioexception"); 263 } 264 265 resetStrBuffer(); 266 } else { 267 error("eof.comment"); 269 } 270 } 271 272 275 protected void handleEmptyTag(TagElement tag) throws ChangedCharSetException { 276 } 277 278 281 protected void handleStartTag(TagElement tag) { 282 } 283 284 287 protected void handleEndTag(TagElement tag) { 288 } 289 290 293 protected void handleError(int ln, String msg) { 294 300 } 301 302 305 void handleText(TagElement tag) { 306 if (tag.breaksFlow()) { 307 space = false; 308 if (!strict) { 309 ignoreSpace = true; 310 } 311 } 312 if (textpos == 0) { 313 if ((!space) || (stack == null) || last.breaksFlow() || 314 !stack.advance(dtd.pcdata)) { 315 last = tag; 316 space = false; 317 lastBlockStartPos = currentBlockStartPos; 318 return; 319 } 320 } 321 if (space) { 322 if (!ignoreSpace) { 323 if (textpos + 1 > text.length) { 325 char newtext[] = new char[text.length + 200]; 326 System.arraycopy(text, 0, newtext, 0, text.length); 327 text = newtext; 328 } 329 330 text[textpos++] = ' '; 332 if (!strict && !tag.getElement().isEmpty()) { 333 ignoreSpace = true; 334 } 335 } 336 space = false; 337 } 338 char newtext[] = new char[textpos]; 339 System.arraycopy(text, 0, newtext, 0, textpos); 340 if (tag.getElement().getName().equals("title")) { 343 handleTitle(newtext); 344 } else { 345 handleText(newtext); 346 } 347 lastBlockStartPos = currentBlockStartPos; 348 textpos = 0; 349 last = tag; 350 space = false; 351 } 352 353 356 protected void error(String err, String arg1, String arg2, 357 String arg3) { 358 handleError (ln, err + arg1 + arg2 + arg3); 360 } 361 362 protected void error(String err, String arg1, String arg2) { 363 error(err, arg1, arg2, "?"); 364 } 365 protected void error(String err, String arg1) { 366 error(err, arg1, "?", "?"); 367 } 368 protected void error(String err) { 369 error(err, "?", "?", "?"); 370 } 371 372 373 378 protected void startTag(TagElement tag) throws ChangedCharSetException { 379 Element elem = tag.getElement(); 380 381 if (!elem.isEmpty() || textpos != 0) { 387 handleText(tag); 388 } else { 389 last = tag; 394 space = false; 397 } 398 lastBlockStartPos = currentBlockStartPos; 399 400 for (AttributeList a = elem.atts ; a != null ; a = a.next) { 402 403 if ((a.modifier == REQUIRED) && ((attributes.isEmpty()) || (!attributes.isDefined(a.name)))) { 404 error("req.att ", a.getName(), elem.getName()); 405 } 406 } 407 408 if (elem.isEmpty()) { 409 handleEmptyTag(tag); 410 414 } else { 415 recent = elem; 416 stack = new TagStack (tag, stack); 417 handleStartTag(tag); 418 } 419 } 420 421 425 protected void endTag(boolean omitted) { 426 handleText(stack.tag); 427 428 if (omitted && !stack.elem.omitEnd()) { 429 error("end.missing", stack.elem.getName()); 430 } else if (!stack.terminate()) { 431 error("end.unexpected", stack.elem.getName()); 432 } 433 434 handleEndTag(stack.tag); 436 stack = stack.next; 437 recent = (stack != null) ? stack.elem : null; 438 } 439 440 441 boolean ignoreElement(Element elem) { 442 443 String stackElement = stack.elem.getName(); 444 String elemName = elem.getName(); 445 451 if ((elemName.equals("html") && seenHtml) || 452 (elemName.equals("head") && seenHead) || 453 (elemName.equals("body") && seenBody)) { 454 return true; 455 } 456 if (elemName.equals("dt") || elemName.equals("dd")) { 457 TagStack s = stack; 458 while (s != null && !s.elem.getName().equals("dl")) { 459 s = s.next; 460 } 461 if (s == null) { 462 return true; 463 } 464 } 465 466 if (((stackElement.equals("table")) && 467 (!elemName.equals("#pcdata")) && (!elemName.equals("input"))) || 468 ((elemName.equals("font")) && 469 (stackElement.equals("ul") || stackElement.equals("ol"))) || 470 (elemName.equals("meta") && stack != null) || 471 (elemName.equals("style") && seenBody) || 472 (stackElement.equals("table") && elemName.equals("a"))) { 473 return true; 474 } 475 return false; 476 } 477 478 479 482 483 protected void markFirstTime(Element elem) { 484 String elemName = elem.getName(); 485 if (elemName.equals("html")) { 486 seenHtml = true; 487 } else if (elemName.equals("head")) { 488 seenHead = true; 489 } else if (elemName.equals("body")) { 490 if (buf.length == 1) { 491 char[] newBuf = new char[256]; 493 494 newBuf[0] = buf[0]; 495 buf = newBuf; 496 } 497 seenBody = true; 498 } 499 } 500 501 504 boolean legalElementContext(Element elem) throws ChangedCharSetException { 505 506 508 if (stack == null) { 510 if (elem != dtd.html) { 512 startTag(makeTag(dtd.html, true)); 514 return legalElementContext(elem); 515 } 516 return true; 517 } 518 519 if (stack.advance(elem)) { 521 markFirstTime(elem); 523 return true; 524 } 525 boolean insertTag = false; 526 527 String stackElemName = stack.elem.getName(); 553 String elemName = elem.getName(); 554 555 556 if (!strict && 557 ((stackElemName.equals("table") && elemName.equals("td")) || 558 (stackElemName.equals("table") && elemName.equals("th")) || 559 (stackElemName.equals("tr") && !elemName.equals("tr")))){ 560 insertTag = true; 561 } 562 563 564 if (!strict && !insertTag && (stack.elem.getName() != elem.getName() || 565 elem.getName().equals("body"))) { 566 if (skipTag = ignoreElement(elem)) { 567 error("tag.ignore", elem.getName()); 568 return skipTag; 569 } 570 } 571 572 if (!strict && stackElemName.equals("table") && 576 !elemName.equals("tr") && !elemName.equals("td") && 577 !elemName.equals("th") && !elemName.equals("caption")) { 578 Element e = dtd.getElement("tr"); 579 TagElement t = makeTag(e, true); 580 legalTagContext(t); 581 startTag(t); 582 error("start.missing", elem.getName()); 583 return legalElementContext(elem); 584 } 585 586 if (!insertTag && stack.terminate() && (!strict || stack.elem.omitEnd())) { 595 for (TagStack s = stack.next ; s != null ; s = s.next) { 596 if (s.advance(elem)) { 597 while (stack != s) { 598 endTag(true); 599 } 600 return true; 601 } 602 if (!s.terminate() || (strict && !s.elem.omitEnd())) { 603 break; 604 } 605 } 606 } 607 608 Element next = stack.first(); 613 if (next != null && (!strict || next.omitStart()) && 614 !(next==dtd.head && elem==dtd.pcdata) ) { 615 TagElement t = makeTag(next, true); 617 legalTagContext(t); 618 startTag(t); 619 if (!next.omitStart()) { 620 error("start.missing", elem.getName()); 621 } 622 return legalElementContext(elem); 623 } 624 625 626 630 if (!strict) { 631 ContentModel content = stack.contentModel(); 632 Vector elemVec = new Vector (); 633 if (content != null) { 634 content.getElements(elemVec); 635 for (Enumeration v = elemVec.elements(); v.hasMoreElements();) { 636 Element e = (Element )v.nextElement(); 637 638 if (stack.excluded(e.getIndex())) { 642 continue; 643 } 644 645 boolean reqAtts = false; 646 647 for (AttributeList a = e.getAttributes(); a != null ; a = a.next) { 648 if (a.modifier == REQUIRED) { 649 reqAtts = true; 650 break; 651 } 652 } 653 if (reqAtts) { 657 continue; 658 } 659 660 ContentModel m = e.getContent(); 661 if (m != null && m.first(elem)) { 662 TagElement t = makeTag(e, true); 664 legalTagContext(t); 665 startTag(t); 666 error("start.missing", e.getName()); 667 return legalElementContext(elem); 668 } 669 } 670 } 671 } 672 673 if (stack.terminate() && (stack.elem != dtd.body) && (!strict || stack.elem.omitEnd())) { 678 if (!stack.elem.omitEnd()) { 680 error("end.missing", elem.getName()); 681 } 682 683 endTag(true); 684 return legalElementContext(elem); 685 } 686 687 return false; 689 } 690 691 694 void legalTagContext(TagElement tag) throws ChangedCharSetException { 695 if (legalElementContext(tag.getElement())) { 696 markFirstTime(tag.getElement()); 697 return; 698 } 699 700 if (tag.breaksFlow() && (stack != null) && !stack.tag.breaksFlow()) { 702 endTag(true); 703 legalTagContext(tag); 704 return; 705 } 706 707 for (TagStack s = stack ; s != null ; s = s.next) { 709 if (s.tag.getElement() == dtd.head) { 710 while (stack != s) { 711 endTag(true); 712 } 713 endTag(true); 714 legalTagContext(tag); 715 return; 716 } 717 } 718 719 error("tag.unexpected", tag.getElement().getName()); 721 } 722 723 727 void errorContext() throws ChangedCharSetException { 728 for (; (stack != null) && (stack.tag.getElement() != dtd.body) ; stack = stack.next) { 729 handleEndTag(stack.tag); 730 } 731 if (stack == null) { 732 legalElementContext(dtd.body); 733 startTag(makeTag(dtd.body, true)); 734 } 735 } 736 737 740 void addString(int c) { 741 if (strpos == str.length) { 742 char newstr[] = new char[str.length + 128]; 743 System.arraycopy(str, 0, newstr, 0, str.length); 744 str = newstr; 745 } 746 str[strpos++] = (char)c; 747 } 748 749 752 String getString(int pos) { 753 char newStr[] = new char[strpos - pos]; 754 System.arraycopy(str, pos, newStr, 0, strpos - pos); 755 strpos = pos; 756 return new String (newStr); 757 } 758 759 char[] getChars(int pos) { 760 char newStr[] = new char[strpos - pos]; 761 System.arraycopy(str, pos, newStr, 0, strpos - pos); 762 strpos = pos; 763 return newStr; 764 } 765 766 char[] getChars(int pos, int endPos) { 767 char newStr[] = new char[endPos - pos]; 768 System.arraycopy(str, pos, newStr, 0, endPos - pos); 769 return newStr; 772 } 773 774 void resetStrBuffer() { 775 strpos = 0; 776 } 777 778 int strIndexOf(char target) { 779 for (int i = 0; i < strpos; i++) { 780 if (str[i] == target) { 781 return i; 782 } 783 } 784 785 return -1; 786 } 787 788 792 void skipSpace() throws IOException { 793 while (true) { 794 switch (ch) { 795 case '\n': 796 ln++; 797 ch = readCh(); 798 lfCount++; 799 break; 800 801 case '\r': 802 ln++; 803 if ((ch = readCh()) == '\n') { 804 ch = readCh(); 805 crlfCount++; 806 } 807 else { 808 crCount++; 809 } 810 break; 811 case ' ': 812 case '\t': 813 ch = readCh(); 814 break; 815 816 default: 817 return; 818 } 819 } 820 } 821 822 827 boolean parseIdentifier(boolean lower) throws IOException { 828 switch (ch) { 829 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': 830 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L': 831 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R': 832 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X': 833 case 'Y': case 'Z': 834 if (lower) { 835 ch = 'a' + (ch - 'A'); 836 } 837 838 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': 839 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l': 840 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r': 841 case 's': case 't': case 'u': case 'v': case 'w': case 'x': 842 case 'y': case 'z': 843 break; 844 845 default: 846 return false; 847 } 848 849 while (true) { 850 addString(ch); 851 852 switch (ch = readCh()) { 853 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': 854 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L': 855 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R': 856 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X': 857 case 'Y': case 'Z': 858 if (lower) { 859 ch = 'a' + (ch - 'A'); 860 } 861 862 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': 863 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l': 864 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r': 865 case 's': case 't': case 'u': case 'v': case 'w': case 'x': 866 case 'y': case 'z': 867 868 case '0': case '1': case '2': case '3': case '4': 869 case '5': case '6': case '7': case '8': case '9': 870 871 case '.': case '-': 872 873 case '_': break; 875 876 default: 877 return true; 878 } 879 } 880 } 881 882 885 private char[] parseEntityReference() throws IOException { 886 int pos = strpos; 887 888 if ((ch = readCh()) == '#') { 889 int n = 0; 890 ch = readCh(); 891 if ((ch >= '0') && (ch <= '9') || 892 ch == 'x' || ch == 'X') { 893 894 if ((ch >= '0') && (ch <= '9')) { 895 while ((ch >= '0') && (ch <= '9')) { 897 n = (n * 10) + ch - '0'; 898 ch = readCh(); 899 } 900 } else { 901 ch = readCh(); 903 char lch = (char) Character.toLowerCase(ch); 904 while ((lch >= '0') && (lch <= '9') || 905 (lch >= 'a') && (lch <= 'f')) { 906 if (lch >= '0' && lch <= '9') { 907 n = (n * 16) + lch - '0'; 908 } else { 909 n = (n * 16) + lch - 'a' + 10; 910 } 911 ch = readCh(); 912 lch = (char) Character.toLowerCase(ch); 913 } 914 } 915 switch (ch) { 916 case '\n': 917 ln++; 918 ch = readCh(); 919 lfCount++; 920 break; 921 922 case '\r': 923 ln++; 924 if ((ch = readCh()) == '\n') { 925 ch = readCh(); 926 crlfCount++; 927 } 928 else { 929 crCount++; 930 } 931 break; 932 933 case ';': 934 ch = readCh(); 935 break; 936 } 937 char data[] = {mapNumericReference((char) n)}; 938 return data; 939 } 940 addString('#'); 941 if (!parseIdentifier(false)) { 942 error("ident.expected"); 943 strpos = pos; 944 char data[] = {'&', '#'}; 945 return data; 946 } 947 } else if (!parseIdentifier(false)) { 948 char data[] = {'&'}; 949 return data; 950 } 951 switch (ch) { 952 case '\n': 953 ln++; 954 ch = readCh(); 955 lfCount++; 956 break; 957 958 case '\r': 959 ln++; 960 if ((ch = readCh()) == '\n') { 961 ch = readCh(); 962 crlfCount++; 963 } 964 else { 965 crCount++; 966 } 967 break; 968 969 case ';': 970 ch = readCh(); 971 break; 972 } 973 974 String nm = getString(pos); 975 Entity ent = dtd.getEntity(nm); 976 977 if (!strict && (ent == null)) { 982 ent = dtd.getEntity(nm.toLowerCase()); 983 } 984 if ((ent == null) || !ent.isGeneral()) { 985 986 if (nm.length() == 0) { 987 error("invalid.entref", nm); 988 return new char[0]; 989 } 990 991 String str = "&" + nm; 992 993 char b[] = new char[str.length()]; 994 str.getChars(0, b.length, b, 0); 995 return b; 996 } 997 return ent.getData(); 998 } 999 1000 1012 private char mapNumericReference(char c) { 1013 if (c < 130 || c > 159) { 1014 return c; 1015 } 1016 return cp1252Map[c - 130]; 1017 } 1018 1019 1022 void parseComment() throws IOException { 1023 1024 while (true) { 1025 int c = ch; 1026 switch (c) { 1027 case '-': 1028 1038 if (!strict && (strpos != 0) && (str[strpos - 1] == '-')) { 1039 if ((ch = readCh()) == '>') { 1040 return; 1041 } 1042 if (ch == '!') { 1043 if ((ch = readCh()) == '>') { 1044 return; 1045 } else { 1046 1047 addString('-'); 1048 addString('!'); 1049 continue; 1050 } 1051 } 1052 break; 1053 } 1054 1055 if ((ch = readCh()) == '-') { 1056 ch = readCh(); 1057 if (strict || ch == '>') { 1058 return; 1059 } 1060 if (ch == '!') { 1061 if ((ch = readCh()) == '>') { 1062 return; 1063 } else { 1064 1065 addString('-'); 1066 addString('!'); 1067 continue; 1068 } 1069 } 1070 1071 addString('-'); 1072 } 1073 break; 1074 1075 case -1: 1076 handleEOFInComment(); 1077 return; 1078 1079 case '\n': 1080 ln++; 1081 ch = readCh(); 1082 lfCount++; 1083 break; 1084 1085 case '>': 1086 ch = readCh(); 1087 break; 1088 1089 case '\r': 1090 ln++; 1091 if ((ch = readCh()) == '\n') { 1092 ch = readCh(); 1093 crlfCount++; 1094 } 1095 else { 1096 crCount++; 1097 } 1098 c = '\n'; 1099 break; 1100 default: 1101 ch = readCh(); 1102 break; 1103 } 1104 1105 addString(c); 1106 } 1107 } 1108 1109 1112 void parseLiteral(boolean replace) throws IOException { 1113 while (true) { 1114 int c = ch; 1115 switch (c) { 1116 case -1: 1117 error("eof.literal", stack.elem.getName()); 1118 endTag(true); 1119 return; 1120 1121 case '>': 1122 ch = readCh(); 1123 int i = textpos - (stack.elem.name.length() + 2), j = 0; 1124 1125 if ((i >= 0) && (text[i++] == '<') && (text[i] == '/')) { 1127 while ((++i < textpos) && 1128 (Character.toLowerCase(text[i]) == stack.elem.name.charAt(j++))); 1129 if (i == textpos) { 1130 textpos -= (stack.elem.name.length() + 2); 1131 if ((textpos > 0) && (text[textpos-1] == '\n')) { 1132 textpos--; 1133 } 1134 endTag(false); 1135 return; 1136 } 1137 } 1138 break; 1139 1140 case '&': 1141 char data[] = parseEntityReference(); 1142 if (textpos + data.length > text.length) { 1143 char newtext[] = new char[Math.max(textpos + data.length + 128, text.length * 2)]; 1144 System.arraycopy(text, 0, newtext, 0, text.length); 1145 text = newtext; 1146 } 1147 System.arraycopy(data, 0, text, textpos, data.length); 1148 textpos += data.length; 1149 continue; 1150 1151 case '\n': 1152 ln++; 1153 ch = readCh(); 1154 lfCount++; 1155 break; 1156 1157 case '\r': 1158 ln++; 1159 if ((ch = readCh()) == '\n') { 1160 ch = readCh(); 1161 crlfCount++; 1162 } 1163 else { 1164 crCount++; 1165 } 1166 c = '\n'; 1167 break; 1168 default: 1169 ch = readCh(); 1170 break; 1171 } 1172 1173 if (textpos == text.length) { 1175 char newtext[] = new char[text.length + 128]; 1176 System.arraycopy(text, 0, newtext, 0, text.length); 1177 text = newtext; 1178 } 1179 text[textpos++] = (char)c; 1180 } 1181 } 1182 1183 1186 String parseAttributeValue(boolean lower) throws IOException { 1187 int delim = -1; 1188 1189 switch(ch) { 1191 case '\'': 1192 case '"': 1193 delim = ch; 1194 ch = readCh(); 1195 break; 1196 } 1197 1198 while (true) { 1200 int c = ch; 1201 1202 switch (c) { 1203 case '\n': 1204 ln++; 1205 ch = readCh(); 1206 lfCount++; 1207 if (delim < 0) { 1208 return getString(0); 1209 } 1210 break; 1211 1212 case '\r': 1213 ln++; 1214 1215 if ((ch = readCh()) == '\n') { 1216 ch = readCh(); 1217 crlfCount++; 1218 } 1219 else { 1220 crCount++; 1221 } 1222 if (delim < 0) { 1223 return getString(0); 1224 } 1225 break; 1226 1227 case '\t': 1228 if (delim < 0) 1229 c = ' '; 1230 case ' ': 1231 ch = readCh(); 1232 if (delim < 0) { 1233 return getString(0); 1234 } 1235 break; 1236 1237 case '>': 1238 case '<': 1239 if (delim < 0) { 1240 return getString(0); 1241 } 1242 ch = readCh(); 1243 break; 1244 1245 case '\'': 1246 case '"': 1247 ch = readCh(); 1248 if (c == delim) { 1249 return getString(0); 1250 } else if (delim == -1) { 1251 error("attvalerr"); 1252 if (strict || ch == ' ') { 1253 return getString(0); 1254 } else { 1255 continue; 1256 } 1257 } 1258 break; 1259 1260 case '=': 1261 if (delim < 0) { 1262 1266 error("attvalerr"); 1267 1270 if (strict) { 1271 return getString(0); 1272 } 1273 } 1274 ch = readCh(); 1275 break; 1276 1277 case '&': 1278 if (strict && delim < 0) { 1279 ch = readCh(); 1280 break; 1281 } 1282 1283 char data[] = parseEntityReference(); 1284 for (int i = 0 ; i < data.length ; i++) { 1285 c = data[i]; 1286 addString((lower && (c >= 'A') && (c <= 'Z')) ? 'a' + c - 'A' : c); 1287 } 1288 continue; 1289 1290 case -1: 1291 return getString(0); 1292 1293 default: 1294 if (lower && (c >= 'A') && (c <= 'Z')) { 1295 c = 'a' + c - 'A'; 1296 } 1297 ch = readCh(); 1298 break; 1299 } 1300 addString(c); 1301 } 1302 } 1303 1304 1305 1308 void parseAttributeSpecificationList(Element elem) throws IOException { 1309 1310 while (true) { 1311 skipSpace(); 1312 1313 switch (ch) { 1314 case '/': 1315 case '>': 1316 case '<': 1317 case -1: 1318 return; 1319 1320 case '-': 1321 if ((ch = readCh()) == '-') { 1322 ch = readCh(); 1323 parseComment(); 1324 strpos = 0; 1325 } else { 1326 error("invalid.tagchar", "-", elem.getName()); 1327 ch = readCh(); 1328 } 1329 continue; 1330 } 1331 1332 AttributeList att = null; 1333 String attname = null; 1334 String attvalue = null; 1335 1336 if (parseIdentifier(true)) { 1337 attname = getString(0); 1338 skipSpace(); 1339 if (ch == '=') { 1340 ch = readCh(); 1341 skipSpace(); 1342 att = elem.getAttribute(attname); 1343 attvalue = parseAttributeValue((att != null) && (att.type != CDATA) && (att.type != NOTATION) && (att.type != NAME)); 1348 } else { 1350 attvalue = attname; 1351 att = elem.getAttributeByValue(attvalue); 1352 if (att == null) { 1353 att = elem.getAttribute(attname); 1354 if (att != null) { 1355 attvalue = att.getValue(); 1356 } 1357 else { 1358 attvalue = null; 1361 } 1362 } 1363 } 1364 } else if (!strict && ch == ',') { ch = readCh(); 1366 continue; 1367 } else if (!strict && ch == '"') { ch = readCh(); 1369 skipSpace(); 1370 if (parseIdentifier(true)) { 1371 attname = getString(0); 1372 if (ch == '"') { 1373 ch = readCh(); 1374 } 1375 skipSpace(); 1376 if (ch == '=') { 1377 ch = readCh(); 1378 skipSpace(); 1379 att = elem.getAttribute(attname); 1380 attvalue = parseAttributeValue((att != null) && 1381 (att.type != CDATA) && 1382 (att.type != NOTATION)); 1383 } else { 1384 attvalue = attname; 1385 att = elem.getAttributeByValue(attvalue); 1386 if (att == null) { 1387 att = elem.getAttribute(attname); 1388 if (att != null) { 1389 attvalue = att.getValue(); 1390 } 1391 } 1392 } 1393 } else { 1394 char str[] = {(char)ch}; 1395 error("invalid.tagchar", new String (str), elem.getName()); 1396 ch = readCh(); 1397 continue; 1398 } 1399 } else if (!strict && (attributes.isEmpty()) && (ch == '=')) { 1400 ch = readCh(); 1401 skipSpace(); 1402 attname = elem.getName(); 1403 att = elem.getAttribute(attname); 1404 attvalue = parseAttributeValue((att != null) && 1405 (att.type != CDATA) && 1406 (att.type != NOTATION)); 1407 } else if (!strict && (ch == '=')) { 1408 ch = readCh(); 1409 skipSpace(); 1410 attvalue = parseAttributeValue(true); 1411 error("attvalerr"); 1412 return; 1413 } else { 1414 char str[] = {(char)ch}; 1415 error("invalid.tagchar", new String (str), elem.getName()); 1416 if (!strict) { 1417 ch = readCh(); 1418 continue; 1419 } else { 1420 return; 1421 } 1422 } 1423 1424 if (att != null) { 1425 attname = att.getName(); 1426 } else { 1427 error("invalid.tagatt", attname, elem.getName()); 1428 } 1429 1430 if (attributes.isDefined(attname)) { 1432 error("multi.tagatt", attname, elem.getName()); 1433 } 1434 if (attvalue == null) { 1435 attvalue = ((att != null) && (att.value != null)) ? att.value : 1436 HTML.NULL_ATTRIBUTE_VALUE; 1437 } else if ((att != null) && (att.values != null) && !att.values.contains(attvalue)) { 1438 error("invalid.tagattval", attname, elem.getName()); 1439 } 1440 HTML.Attribute attkey = HTML.getAttributeKey(attname); 1441 if (attkey == HTML.Attribute.CLASS) { 1442 attvalue = attvalue.toLowerCase(); 1443 } 1444 if (attkey == null) { 1445 attributes.addAttribute(attname, attvalue); 1446 } else { 1447 attributes.addAttribute(attkey, attvalue); 1448 } 1449 } 1450 } 1451 1452 1456 public String parseDTDMarkup() throws IOException { 1457 1458 StringBuffer strBuff = new StringBuffer (); 1459 ch = readCh(); 1460 while(true) { 1461 switch (ch) { 1462 case '>': 1463 ch = readCh(); 1464 return strBuff.toString(); 1465 case -1: 1466 error("invalid.markup"); 1467 return strBuff.toString(); 1468 case '\n': 1469 ln++; 1470 ch = readCh(); 1471 lfCount++; 1472 break; 1473 case '"': 1474 ch = readCh(); 1475 break; 1476 case '\r': 1477 ln++; 1478 if ((ch = readCh()) == '\n') { 1479 ch = readCh(); 1480 crlfCount++; 1481 } 1482 else { 1483 crCount++; 1484 } 1485 break; 1486 default: 1487 strBuff.append((char)(ch & 0xFF)); 1488 ch = readCh(); 1489 break; 1490 } 1491 } 1492 } 1493 1494 1499 protected boolean parseMarkupDeclarations(StringBuffer strBuff) throws IOException { 1500 1501 1502 if ((strBuff.length() == "DOCTYPE".length()) && 1503 (strBuff.toString().toUpperCase().equals("DOCTYPE"))) { 1504 parseDTDMarkup(); 1505 return true; 1506 } 1507 return false; 1508 } 1509 1510 1513 void parseInvalidTag() throws IOException { 1514 while (true) { 1516 skipSpace(); 1517 switch (ch) { 1518 case '>': 1519 case -1: 1520 ch = readCh(); 1521 return; 1522 case '<': 1523 return; 1524 default: 1525 ch = readCh(); 1526 1527 } 1528 } 1529 } 1530 1531 1534 void parseTag() throws IOException { 1535 Element elem = null; 1536 boolean net = false; 1537 boolean warned = false; 1538 boolean unknown = false; 1539 1540 switch (ch = readCh()) { 1541 case '!': 1542 switch (ch = readCh()) { 1543 case '-': 1544 while (true) { 1546 if (ch == '-') { 1547 if (!strict || ((ch = readCh()) == '-')) { 1548 ch = readCh(); 1549 if (!strict && ch == '-') { 1550 ch = readCh(); 1551 } 1552 if (textpos != 0) { 1556 char newtext[] = new char[textpos]; 1557 System.arraycopy(text, 0, newtext, 0, textpos); 1558 handleText(newtext); 1559 lastBlockStartPos = currentBlockStartPos; 1560 textpos = 0; 1561 } 1562 parseComment(); 1563 last = makeTag(dtd.getElement("comment"), true); 1564 handleComment(getChars(0)); 1565 continue; 1566 } else if (!warned) { 1567 warned = true; 1568 error("invalid.commentchar", "-"); 1569 } 1570 } 1571 skipSpace(); 1572 switch (ch) { 1573 case '-': 1574 continue; 1575 case '>': 1576 ch = readCh(); 1577 case -1: 1578 return; 1579 default: 1580 ch = readCh(); 1581 if (!warned) { 1582 warned = true; 1583 error("invalid.commentchar", 1584 String.valueOf((char)ch)); 1585 } 1586 break; 1587 } 1588 } 1589 1590 default: 1591 StringBuffer strBuff = new StringBuffer (); 1593 while (true) { 1594 strBuff.append((char)ch); 1595 if (parseMarkupDeclarations(strBuff)) { 1596 return; 1597 } 1598 switch(ch) { 1599 case '>': 1600 ch = readCh(); 1601 case -1: 1602 error("invalid.markup"); 1603 return; 1604 case '\n': 1605 ln++; 1606 ch = readCh(); 1607 lfCount++; 1608 break; 1609 case '\r': 1610 ln++; 1611 if ((ch = readCh()) == '\n') { 1612 ch = readCh(); 1613 crlfCount++; 1614 } 1615 else { 1616 crCount++; 1617 } 1618 break; 1619 1620 default: 1621 ch = readCh(); 1622 break; 1623 } 1624 } 1625 } 1626 1627 case '/': 1628 switch (ch = readCh()) { 1630 case '>': 1631 ch = readCh(); 1632 case '<': 1633 if (recent == null) { 1635 error("invalid.shortend"); 1636 return; 1637 } 1638 elem = recent; 1639 break; 1640 1641 default: 1642 if (!parseIdentifier(true)) { 1643 error("expected.endtagname"); 1644 return; 1645 } 1646 skipSpace(); 1647 switch (ch) { 1648 case '>': 1649 ch = readCh(); 1650 case '<': 1651 break; 1652 1653 default: 1654 error("expected", "'>'"); 1655 while ((ch != -1) && (ch != '\n') && (ch != '>')) { 1656 ch = readCh(); 1657 } 1658 if (ch == '>') { 1659 ch = readCh(); 1660 } 1661 break; 1662 } 1663 String elemStr = getString(0); 1664 if (!dtd.elementExists(elemStr)) { 1665 error("end.unrecognized", elemStr); 1666 if ((textpos > 0) && (text[textpos-1] == '\n')) { 1668 textpos--; 1669 } 1670 elem = dtd.getElement("unknown"); 1671 elem.name = elemStr; 1672 unknown = true; 1673 } else { 1674 elem = dtd.getElement(elemStr); 1675 } 1676 break; 1677 } 1678 1679 1680 1683 if (stack == null) { 1684 error("end.extra.tag", elem.getName()); 1685 return; 1686 } 1687 1688 if ((textpos > 0) && (text[textpos-1] == '\n')) { 1690 if (stack.pre) { 1695 if ((textpos > 1) && (text[textpos-2] != '\n')) { 1696 textpos--; 1697 } 1698 } else { 1699 textpos--; 1700 } 1701 } 1702 1703 1708 1719 1720 if (unknown) { 1721 TagElement t = makeTag(elem); 1727 handleText(t); 1728 attributes.addAttribute(HTML.Attribute.ENDTAG, "true"); 1729 handleEmptyTag(makeTag(elem)); 1730 unknown = false; 1731 return; 1732 } 1733 1734 1736 if (!strict) { 1741 String stackElem = stack.elem.getName(); 1742 1743 if (stackElem.equals("table")) { 1744 if (!elem.getName().equals(stackElem)) { 1747 error("tag.ignore", elem.getName()); 1748 return; 1749 } 1750 } 1751 1752 1753 1754 if (stackElem.equals("tr") || 1755 stackElem.equals("td")) { 1756 if ((!elem.getName().equals("table")) && 1757 (!elem.getName().equals(stackElem))) { 1758 error("tag.ignore", elem.getName()); 1759 return; 1760 } 1761 } 1762 } 1763 TagStack sp = stack; 1764 1765 while ((sp != null) && (elem != sp.elem)) { 1766 sp = sp.next; 1767 } 1768 if (sp == null) { 1769 error("unmatched.endtag", elem.getName()); 1770 return; 1771 } 1772 1773 String elemName = elem.getName(); 1779 if (stack != sp && 1780 (elemName.equals("font") || 1781 elemName.equals("center"))) { 1782 1783 if (elemName.equals("center")) { 1789 while(stack.elem.omitEnd() && stack != sp) { 1790 endTag(true); 1791 } 1792 if (stack.elem == elem) { 1793 endTag(false); 1794 } 1795 } 1796 return; 1797 } 1798 1802 1803 1804 while (stack != sp) { 1806 endTag(true); 1807 } 1808 1809 endTag(false); 1810 return; 1811 1812 case -1: 1813 error("eof"); 1814 return; 1815 } 1816 1817 if (!parseIdentifier(true)) { 1819 elem = recent; 1820 if ((ch != '>') || (elem == null)) { 1821 error("expected.tagname"); 1822 return; 1823 } 1824 } else { 1825 String elemStr = getString(0); 1826 1827 if (elemStr.equals("image")) { 1828 elemStr = new String ("img"); 1829 } 1830 1831 1832 1833 if (!dtd.elementExists(elemStr)) { 1834 error("tag.unrecognized ", elemStr); 1836 elem = dtd.getElement("unknown"); 1837 elem.name = elemStr; 1838 unknown = true; 1839 } else { 1840 elem = dtd.getElement(elemStr); 1841 } 1842 } 1843 1844 parseAttributeSpecificationList(elem); 1846 1847 switch (ch) { 1848 case '/': 1849 net = true; 1850 case '>': 1851 ch = readCh(); 1852 case '<': 1853 break; 1854 1855 default: 1856 error("expected", "'>'"); 1857 break; 1858 } 1859 1860 if (!strict) { 1861 if (elem.getName().equals("script")) { 1862 error("javascript.unsupported"); 1863 } 1864 } 1865 1866 if (!elem.isEmpty()) { 1869 if (ch == '\n') { 1870 ln++; 1871 lfCount++; 1872 ch = readCh(); 1873 } else if (ch == '\r') { 1874 ln++; 1875 if ((ch = readCh()) == '\n') { 1876 ch = readCh(); 1877 crlfCount++; 1878 } 1879 else { 1880 crCount++; 1881 } 1882 } 1883 } 1884 1885 TagElement tag = makeTag(elem, false); 1887 1888 1889 1895 1896 1906 if (!unknown) { 1910 legalTagContext(tag); 1911 1912 if (!strict && skipTag) { 1917 skipTag = false; 1918 return; 1919 } 1920 } 1921 1924 1925 startTag(tag); 1926 1927 if (!elem.isEmpty()) { 1928 switch (elem.getType()) { 1929 case CDATA: 1930 parseLiteral(false); 1931 break; 1932 case RCDATA: 1933 parseLiteral(true); 1934 break; 1935 default: 1936 if (stack != null) { 1937 stack.net = net; 1938 } 1939 break; 1940 } 1941 } 1942 } 1943 1944 1947 void parseContent() throws IOException { 1948 Thread curThread = Thread.currentThread(); 1949 1950 for (;;) { 1951 if (curThread.isInterrupted()) { 1952 curThread.interrupt(); break; 1954 } 1955 1956 int c = ch; 1957 currentBlockStartPos = currentPosition; 1958 switch (c) { 1959 case '<': 1960 parseTag(); 1961 lastBlockStartPos = currentPosition; 1962 continue; 1963 1964 case '/': 1965 ch = readCh(); 1966 if ((stack != null) && stack.net) { 1967 endTag(false); 1969 continue; 1970 } 1971 break; 1972 1973 case -1: 1974 return; 1975 1976 case '&': 1977 if (textpos == 0) { 1978 if (!legalElementContext(dtd.pcdata)) { 1979 error("unexpected.pcdata"); 1980 } 1981 if (last.breaksFlow()) { 1982 space = false; 1983 } 1984 } 1985 char data[] = parseEntityReference(); 1986 if (textpos + data.length + 1 > text.length) { 1987 char newtext[] = new char[Math.max(textpos + data.length + 128, text.length * 2)]; 1988 System.arraycopy(text, 0, newtext, 0, text.length); 1989 text = newtext; 1990 } 1991 if (space) { 1992 space = false; 1993 text[textpos++] = ' '; 1994 } 1995 System.arraycopy(data, 0, text, textpos, data.length); 1996 textpos += data.length; 1997 ignoreSpace = false; 1998 continue; 1999 2000 case '\n': 2001 ln++; 2002 lfCount++; 2003 ch = readCh(); 2004 if ((stack != null) && stack.pre) { 2005 break; 2006 } 2007 if (textpos == 0) { 2008 lastBlockStartPos = currentPosition; 2009 } 2010 if (!ignoreSpace) { 2011 space = true; 2012 } 2013 continue; 2014 2015 case '\r': 2016 ln++; 2017 c = '\n'; 2018 if ((ch = readCh()) == '\n') { 2019 ch = readCh(); 2020 crlfCount++; 2021 } 2022 else { 2023 crCount++; 2024 } 2025 if ((stack != null) && stack.pre) { 2026 break; 2027 } 2028 if (textpos == 0) { 2029 lastBlockStartPos = currentPosition; 2030 } 2031 if (!ignoreSpace) { 2032 space = true; 2033 } 2034 continue; 2035 2036 2037 case '\t': 2038 case ' ': 2039 ch = readCh(); 2040 if ((stack != null) && stack.pre) { 2041 break; 2042 } 2043 if (textpos == 0) { 2044 lastBlockStartPos = currentPosition; 2045 } 2046 if (!ignoreSpace) { 2047 space = true; 2048 } 2049 continue; 2050 2051 default: 2052 if (textpos == 0) { 2053 if (!legalElementContext(dtd.pcdata)) { 2054 error("unexpected.pcdata"); 2055 } 2056 if (last.breaksFlow()) { 2057 space = false; 2058 } 2059 } 2060 ch = readCh(); 2061 break; 2062 } 2063 2064 if (textpos + 2 > text.length) { 2066 char newtext[] = new char[text.length + 128]; 2067 System.arraycopy(text, 0, newtext, 0, text.length); 2068 text = newtext; 2069 } 2070 2071 if (space) { 2073 if (textpos == 0) { 2074 lastBlockStartPos--; 2075 } 2076 text[textpos++] = ' '; 2077 space = false; 2078 } 2079 text[textpos++] = (char)c; 2080 ignoreSpace = false; 2081 } 2082 } 2083 2084 2088 String getEndOfLineString() { 2089 if (crlfCount >= crCount) { 2090 if (lfCount >= crlfCount) { 2091 return "\n"; 2092 } 2093 else { 2094 return "\r\n"; 2095 } 2096 } 2097 else { 2098 if (crCount > lfCount) { 2099 return "\r"; 2100 } 2101 else { 2102 return "\n"; 2103 } 2104 } 2105 } 2106 2107 2110 public synchronized void parse(Reader in) throws IOException { 2111 this.in = in; 2112 2113 this.ln = 1; 2114 2115 seenHtml = false; 2116 seenHead = false; 2117 seenBody = false; 2118 2119 crCount = lfCount = crlfCount = 0; 2120 2121 try { 2122 try { 2123 ch = readCh(); 2124 text = new char[1024]; 2125 str = new char[128]; 2126 2127 parseContent(); 2128 while (stack != null) { 2131 endTag(true); 2132 } 2133 } finally { 2134 in.close(); 2135 } 2136 2137 } catch (IOException e) { 2138 errorContext(); 2139 error("ioexception"); 2140 throw e; 2141 } catch (Exception e) { 2142 errorContext(); 2143 error("exception", e.getClass().getName(), e.getMessage()); 2144 e.printStackTrace(); 2145 } catch (ThreadDeath e) { 2146 errorContext(); 2147 error("terminated"); 2148 e.printStackTrace(); 2149 throw e; 2150 } finally { 2151 for (; stack != null ; stack = stack.next) { 2152 handleEndTag(stack.tag); 2153 } 2154 2155 text = null; 2156 str = null; 2157 } 2158 2159 } 2160 2161 2162 2173 private char buf[] = new char[1]; 2174 private int pos; 2175 private int len; 2176 2180 private int currentPosition; 2181 2182 2183 private final int readCh() throws IOException { 2184 2185 if (pos >= len) { 2186 2187 for (;;) { 2190 try { 2191 len = in.read(buf); 2192 break; 2193 } catch (InterruptedIOException ex) { 2194 throw ex; 2195 } 2196 } 2197 2198 if (len <= 0) { 2199 return -1; } 2201 pos = 0; 2202 } 2203 ++currentPosition; 2204 2205 return buf[pos++]; 2206 } 2207 2208 2209 protected int getCurrentPos() { 2210 return currentPosition; 2211 } 2212} 2213 | Popular Tags |