| 1 5 6 package org.w3c.tidy; 7 8 33 34 57 58 import java.io.PrintWriter ; 59 import java.util.Stack ; 60 import java.util.Vector ; 61 62 public class Lexer { 63 64 65 public StreamIn in; 66 public PrintWriter errout; 67 public short badAccess; 68 public short badLayout; 69 public short badChars; 70 public short badForm; 71 public short warnings; 72 public short errors; 73 public int lines; 74 public int columns; 75 public boolean waswhite; 76 public boolean pushed; 77 public boolean insertspace; 78 public boolean excludeBlocks; 79 public boolean exiled; 80 public boolean isvoyager; 81 public short versions; 82 public int doctype; 83 public boolean badDoctype; 84 public int txtstart; 85 public int txtend; 86 public short state; 87 public Node token; 88 89 98 public byte[] lexbuf; 99 public int lexlength; 100 public int lexsize; 101 102 103 public Node inode; 104 public int insert; 105 public Stack istack; 106 public int istackbase; 107 108 public Style styles; 109 110 public Configuration configuration; 111 protected int seenBodyEndTag; 112 private Vector nodeList; 113 114 public Lexer(StreamIn in, Configuration configuration) 115 { 116 this.in = in; 117 this.lines = 1; 118 this.columns = 1; 119 this.state = LEX_CONTENT; 120 this.badAccess = 0; 121 this.badLayout = 0; 122 this.badChars = 0; 123 this.badForm = 0; 124 this.warnings = 0; 125 this.errors = 0; 126 this.waswhite = false; 127 this.pushed = false; 128 this.insertspace = false; 129 this.exiled = false; 130 this.isvoyager = false; 131 this.versions = Dict.VERS_EVERYTHING; 132 this.doctype = Dict.VERS_UNKNOWN; 133 this.badDoctype = false; 134 this.txtstart = 0; 135 this.txtend = 0; 136 this.token = null; 137 this.lexbuf = null; 138 this.lexlength = 0; 139 this.lexsize = 0; 140 this.inode = null; 141 this.insert = -1; 142 this.istack = new Stack (); 143 this.istackbase = 0; 144 this.styles = null; 145 this.configuration = configuration; 146 this.seenBodyEndTag = 0; 147 this.nodeList = new Vector (); 148 } 149 150 public Node newNode() 151 { 152 Node node = new Node(); 153 nodeList.addElement(node); 154 return node; 155 } 156 157 public Node newNode(short type, byte[] textarray, int start, int end) 158 { 159 Node node = new Node(type, textarray, start, end); 160 nodeList.addElement(node); 161 return node; 162 } 163 164 public Node newNode(short type, byte[] textarray, int start, int end, String element) 165 { 166 Node node = new Node(type, textarray, start, end, element, configuration.tt); 167 nodeList.addElement(node); 168 return node; 169 } 170 171 public Node cloneNode(Node node) 172 { 173 Node cnode = (Node)node.clone(); 174 nodeList.addElement(cnode); 175 for (AttVal att = cnode.attributes; att != null; att = att.next) { 176 if (att.asp != null) 177 nodeList.addElement(att.asp); 178 if (att.php != null) 179 nodeList.addElement(att.php); 180 } 181 return cnode; 182 } 183 184 public AttVal cloneAttributes(AttVal attrs) 185 { 186 AttVal cattrs = (AttVal)attrs.clone(); 187 for (AttVal att = cattrs; att != null; att = att.next) { 188 if (att.asp != null) 189 nodeList.addElement(att.asp); 190 if (att.php != null) 191 nodeList.addElement(att.php); 192 } 193 return cattrs; 194 } 195 196 protected void updateNodeTextArrays(byte[] oldtextarray, byte[] newtextarray) 197 { 198 Node node; 199 for (int i = 0; i < nodeList.size(); i++) { 200 node = (Node)(nodeList.elementAt(i)); 201 if (node.textarray == oldtextarray) 202 node.textarray = newtextarray; 203 } 204 } 205 206 207 public Node newLineNode() 208 { 209 Node node = newNode(); 210 211 node.textarray = this.lexbuf; 212 node.start = this.lexsize; 213 addCharToLexer((int)'\n'); 214 node.end = this.lexsize; 215 return node; 216 } 217 218 222 public static byte[] getBytes(String str) { 223 try { 224 return str.getBytes("UTF8"); 225 } catch (java.io.UnsupportedEncodingException e) { 226 throw new Error ("string to UTF-8 conversion failed: " + e.getMessage()); 227 } 228 } 229 230 public static String getString(byte[] bytes, int offset, int length) { 231 try { 232 return new String (bytes, offset, length, "UTF8"); 233 } catch (java.io.UnsupportedEncodingException e) { 234 throw new Error ("UTF-8 to string conversion failed: " + e.getMessage()); 235 } 236 } 237 238 public boolean endOfInput() 239 { 240 return this.in.isEndOfStream(); 241 } 242 243 public void addByte(int c) 244 { 245 if (this.lexsize + 1 >= this.lexlength) 246 { 247 while (this.lexsize + 1 >= this.lexlength) 248 { 249 if (this.lexlength == 0) 250 this.lexlength = 8192; 251 else 252 this.lexlength = this.lexlength * 2; 253 } 254 255 byte[] temp = this.lexbuf; 256 this.lexbuf = new byte[ this.lexlength ]; 257 if (temp != null) 258 { 259 System.arraycopy( temp, 0, this.lexbuf, 0, temp.length ); 260 updateNodeTextArrays(temp, this.lexbuf); 261 } 262 } 263 264 this.lexbuf[this.lexsize++] = (byte)c; 265 this.lexbuf[this.lexsize] = (byte)'\0'; 266 } 267 268 public void changeChar(byte c) 269 { 270 if (this.lexsize > 0) 271 { 272 this.lexbuf[this.lexsize-1] = c; 273 } 274 } 275 276 277 public void addCharToLexer(int c) 278 { 279 if (c < 128) 280 addByte(c); 281 else if (c <= 0x7FF) 282 { 283 addByte(0xC0 | (c >> 6)); 284 addByte(0x80 | (c & 0x3F)); 285 } 286 else if (c <= 0xFFFF) 287 { 288 addByte(0xE0 | (c >> 12)); 289 addByte(0x80 | ((c >> 6) & 0x3F)); 290 addByte(0x80 | (c & 0x3F)); 291 } 292 else if (c <= 0x1FFFFF) 293 { 294 addByte(0xF0 | (c >> 18)); 295 addByte(0x80 | ((c >> 12) & 0x3F)); 296 addByte(0x80 | ((c >> 6) & 0x3F)); 297 addByte(0x80 | (c & 0x3F)); 298 } 299 else 300 { 301 addByte(0xF8 | (c >> 24)); 302 addByte(0x80 | ((c >> 18) & 0x3F)); 303 addByte(0x80 | ((c >> 12) & 0x3F)); 304 addByte(0x80 | ((c >> 6) & 0x3F)); 305 addByte(0x80 | (c & 0x3F)); 306 } 307 } 308 309 public void addStringToLexer(String str) 310 { 311 for ( int i = 0; i < str.length(); i++ ) { 312 addCharToLexer( (int)str.charAt(i) ); 313 } 314 } 315 316 327 public void parseEntity(short mode) 328 { 329 short map; 330 int start; 331 boolean first = true; 332 boolean semicolon = false; 333 boolean numeric = false; 334 int c, ch, startcol; 335 String str; 336 337 start = this.lexsize - 1; 338 startcol = this.in.curcol - 1; 339 340 while (true) 341 { 342 c = this.in.readChar(); 343 if (c == StreamIn.EndOfStream) break; 344 if (c == ';') 345 { 346 semicolon = true; 347 break; 348 } 349 350 if (first && c == '#') 351 { 352 addCharToLexer(c); 353 first = false; 354 numeric = true; 355 continue; 356 } 357 358 first = false; 359 map = MAP((char)c); 360 361 365 if (numeric && ((c == 'x') || ((map & DIGIT) != 0))) 366 { 367 addCharToLexer(c); 368 continue; 369 } 370 if (!numeric && ((map & NAMECHAR) != 0)) 371 { 372 addCharToLexer(c); 373 continue; 374 } 375 376 377 378 this.in.ungetChar(c); 379 break; 380 } 381 382 str = getString( this.lexbuf, start, this.lexsize - start ); 383 ch = EntityTable.getDefaultEntityTable().entityCode( str ); 384 385 386 if (ch <= 0) 387 { 388 389 this.lines = this.in.curline; 390 this.columns = startcol; 391 392 if (this.lexsize > start +1 ) 393 { 394 Report.entityError(this, Report.UNKNOWN_ENTITY, str, ch); 395 396 if (semicolon) 397 addCharToLexer(';'); 398 } 399 else 400 { 401 Report.entityError(this, Report.UNESCAPED_AMPERSAND, str, ch); 402 } 403 } 404 else 405 { 406 if (c != ';') 407 { 408 409 this.lines = this.in.curline; 410 this.columns = startcol; 411 Report.entityError(this, Report.MISSING_SEMICOLON, str, c); 412 } 413 414 this.lexsize = start; 415 416 if (ch == 160 && (mode & Preformatted) != 0) 417 ch = ' '; 418 419 addCharToLexer(ch); 420 421 if (ch == '&' && !this.configuration.QuoteAmpersand) 422 { 423 addCharToLexer('a'); 424 addCharToLexer('m'); 425 addCharToLexer('p'); 426 addCharToLexer(';'); 427 } 428 } 429 } 430 431 public char parseTagName() 432 { 433 short map; 434 int c; 435 436 437 438 c = this.lexbuf[this.txtstart]; 439 map = MAP((char)c); 440 441 if (!this.configuration.XmlTags && (map & UPPERCASE) != 0) 442 { 443 c += (int)((int)'a' - (int)'A'); 444 this.lexbuf[this.txtstart] = (byte)c; 445 } 446 447 while (true) 448 { 449 c = this.in.readChar(); 450 if (c == StreamIn.EndOfStream) break; 451 map = MAP((char)c); 452 453 if ((map & NAMECHAR) == 0) 454 break; 455 456 457 458 if (!this.configuration.XmlTags && (map & UPPERCASE) != 0) 459 c += (int)((int)'a' - (int)'A'); 460 461 addCharToLexer(c); 462 } 463 464 this.txtend = this.lexsize; 465 return (char)c; 466 } 467 468 public void addStringLiteral(String str) 469 { 470 for ( int i = 0; i < str.length(); i++ ) { 471 addCharToLexer( (int)str.charAt(i) ); 472 } 473 } 474 475 476 public short HTMLVersion() 477 { 478 short versions; 479 480 versions = this.versions; 481 482 if ((versions & Dict.VERS_HTML20) != 0) 483 return Dict.VERS_HTML20; 484 485 if ((versions & Dict.VERS_HTML32) != 0) 486 return Dict.VERS_HTML32; 487 488 if ((versions & Dict.VERS_HTML40_STRICT) != 0) 489 return Dict.VERS_HTML40_STRICT; 490 491 if ((versions & Dict.VERS_HTML40_LOOSE) != 0) 492 return Dict.VERS_HTML40_LOOSE; 493 494 if ((versions & Dict.VERS_FRAMES) != 0) 495 return Dict.VERS_FRAMES; 496 497 return Dict.VERS_UNKNOWN; 498 } 499 500 public String HTMLVersionName() 501 { 502 short guessed; 503 int j; 504 505 guessed = apparentVersion(); 506 507 for (j = 0; j < W3CVersion.length; ++j) 508 { 509 if (guessed == W3CVersion[j].code) 510 { 511 if (this.isvoyager) 512 return W3CVersion[j].voyagerName; 513 514 return W3CVersion[j].name; 515 } 516 } 517 518 return null; 519 } 520 521 522 public boolean addGenerator(Node root) 523 { 524 AttVal attval; 525 Node node; 526 Node head = root.findHEAD(configuration.tt); 527 528 if (head != null) 529 { 530 for (node = head.content; node != null; node = node.next) 531 { 532 if (node.tag == configuration.tt.tagMeta) 533 { 534 attval = node.getAttrByName("name"); 535 536 if (attval != null && attval.value != null && 537 Lexer.wstrcasecmp(attval.value, "generator") == 0) 538 { 539 attval = node.getAttrByName("content"); 540 541 if (attval != null && attval.value != null && 542 attval.value.length() >= 9 && 543 Lexer.wstrcasecmp(attval.value.substring(0, 9), "HTML Tidy") == 0) 544 { 545 return false; 546 } 547 } 548 } 549 } 550 551 node = this.inferredTag("meta"); 552 node.addAttribute("content", "HTML Tidy, see www.w3.org"); 553 node.addAttribute("name", "generator"); 554 Node.insertNodeAtStart(head, node); 555 return true; 556 } 557 558 return false; 559 } 560 561 562 563 564 private static boolean findBadSubString(String s, String p, int len) 565 { 566 int n = s.length(); 567 int i = 0; 568 String ps; 569 570 while (n < len) 571 { 572 ps = p.substring(i, i + n); 573 if (wstrcasecmp(s, ps) == 0) 574 return (!ps.equals(s.substring(0, n))); 575 576 ++i; 577 --len; 578 } 579 580 return false; 581 } 582 583 public boolean checkDocTypeKeyWords(Node doctype) 584 { 585 int len = doctype.end - doctype.start; 586 String s = getString(this.lexbuf, doctype.start, len); 587 588 return !( 589 findBadSubString("SYSTEM", s, len) || 590 findBadSubString("PUBLIC", s, len) || 591 findBadSubString("//DTD", s, len) || 592 findBadSubString("//W3C", s, len) || 593 findBadSubString("//EN", s, len) 594 ); 595 } 596 597 598 public short findGivenVersion(Node doctype) 599 { 600 String p, s; 601 int i, j; 602 int len; 603 String str1; 604 String str2; 605 606 607 str1 = getString(this.lexbuf, doctype.start, 5); 608 if (wstrcasecmp(str1, "html ") != 0) 609 return 0; 610 611 if (!checkDocTypeKeyWords(doctype)) 612 Report.warning(this, doctype, null, Report.DTYPE_NOT_UPPER_CASE); 613 614 615 str1 = getString(this.lexbuf, doctype.start + 5, 7); 616 if (wstrcasecmp(str1, "SYSTEM ") == 0) 617 { 618 619 if (!str1.substring(0, 6).equals("SYSTEM")) 620 System.arraycopy( getBytes("SYSTEM"), 0, 621 this.lexbuf, doctype.start + 5, 6 ); 622 return 0; 623 } 624 625 if (wstrcasecmp(str1, "PUBLIC ") == 0) 626 { 627 if (!str1.substring(0, 6).equals("PUBLIC")) 628 System.arraycopy( getBytes("PUBLIC "), 0, 629 this.lexbuf, doctype.start + 5, 6 ); 630 } 631 else 632 this.badDoctype = true; 633 634 for (i = doctype.start; i < doctype.end; ++i) 635 { 636 if (this.lexbuf[i] == (byte)'"') 637 { 638 str1 = getString( this.lexbuf, i + 1, 12 ); 639 str2 = getString( this.lexbuf, i + 1, 13 ); 640 if (str1.equals("-//W3C//DTD ")) 641 { 642 643 for (j = i + 13; j < doctype.end && this.lexbuf[j] != (byte)'/'; ++j); 644 len = j - i - 13; 645 p = getString( this.lexbuf, i + 13, len ); 646 647 for (j = 1; j < W3CVersion.length; ++j) 648 { 649 s = W3CVersion[j].name; 650 if (len == s.length() && s.equals(p)) 651 return W3CVersion[j].code; 652 } 653 654 655 } 656 else if (str2.equals("-//IETF//DTD ")) 657 { 658 659 for (j = i + 14; j < doctype.end && this.lexbuf[j] != (byte)'/'; ++j); 660 len = j - i - 14; 661 662 p = getString( this.lexbuf, i + 14, len ); 663 s = W3CVersion[0].name; 664 if (len == s.length() && s.equals(p)) 665 return W3CVersion[0].code; 666 667 668 } 669 break; 670 } 671 } 672 673 return 0; 674 } 675 676 public void fixHTMLNameSpace(Node root, String profile) 677 { 678 Node node; 679 AttVal prev, attr; 680 681 for (node = root.content; 682 node != null && node.tag != configuration.tt.tagHtml; node = node.next); 683 684 if (node != null) 685 { 686 prev = null; 687 688 for (attr = node.attributes; attr != null; attr = attr.next) 689 { 690 if (attr.attribute.equals("xmlns")) 691 break; 692 693 prev = attr; 694 } 695 696 if (attr != null) 697 { 698 if (!attr.value.equals(profile)) 699 { 700 Report.warning(this, node, null, Report.INCONSISTENT_NAMESPACE); 701 attr.value = profile; 702 } 703 } 704 else 705 { 706 attr = new AttVal( node.attributes, null, (int)'"', 707 "xmlns", profile ); 708 attr.dict = 709 AttributeTable.getDefaultAttributeTable().findAttribute( attr ); 710 node.attributes = attr; 711 } 712 } 713 } 714 715 public boolean setXHTMLDocType(Node root) 716 { 717 String fpi = " "; 718 String sysid = ""; 719 String namespace = XHTML_NAMESPACE; 720 Node doctype; 721 722 doctype = root.findDocType(); 723 724 if (configuration.docTypeMode == Configuration.DOCTYPE_OMIT) 725 { 726 if (doctype != null) 727 Node.discardElement(doctype); 728 return true; 729 } 730 731 if (configuration.docTypeMode == Configuration.DOCTYPE_AUTO) 732 { 733 734 if ((this.versions & Dict.VERS_HTML40_STRICT) != 0) 735 { 736 fpi = "-//W3C//DTD XHTML 1.0 Strict//EN"; 737 sysid = voyager_strict; 738 } 739 else if ((this.versions & Dict.VERS_LOOSE) != 0) 740 { 741 fpi = "-//W3C//DTD XHTML 1.0 Transitional//EN"; 742 sysid = voyager_loose; 743 } 744 else if ((this.versions & Dict.VERS_FRAMES) != 0) 745 { 746 fpi = "-//W3C//DTD XHTML 1.0 Frameset//EN"; 747 sysid = voyager_frameset; 748 } 749 else 750 { 751 fpi = "-//W3C//DTD XHTML 1.0 Transitional//EN"; 752 sysid = voyager_loose; 753 } 754 } 755 else if (configuration.docTypeMode == Configuration.DOCTYPE_STRICT) 756 { 757 fpi = "-//W3C//DTD XHTML 1.0 Strict//EN"; 758 sysid = voyager_strict; 759 } 760 else if (configuration.docTypeMode == Configuration.DOCTYPE_LOOSE) 761 { 762 fpi = "-//W3C//DTD XHTML 1.0 Transitional//EN"; 763 sysid = voyager_loose; 764 } 765 766 fixHTMLNameSpace(root, namespace); 767 768 if (doctype == null) 769 { 770 doctype = newNode(Node.DocTypeTag, this.lexbuf, 0, 0); 771 doctype.next = root.content; 772 doctype.parent = root; 773 doctype.prev = null; 774 root.content = doctype; 775 } 776 777 if (configuration.docTypeMode == Configuration.DOCTYPE_USER && 778 configuration.docTypeStr != null) 779 { 780 fpi = configuration.docTypeStr; 781 sysid = ""; 782 } 783 784 this.txtstart = this.lexsize; 785 this.txtend = this.lexsize; 786 787 788 addStringLiteral("html PUBLIC "); 789 790 791 if (fpi.charAt(0) == '"') 792 addStringLiteral(fpi); 793 else 794 { 795 addStringLiteral("\""); 796 addStringLiteral(fpi); 797 addStringLiteral("\""); 798 } 799 800 if (sysid.length() + 6 >= this.configuration.wraplen) 801 addStringLiteral("\n\""); 802 else 803 addStringLiteral("\n \""); 804 805 806 addStringLiteral(sysid); 807 addStringLiteral("\""); 808 809 this.txtend = this.lexsize; 810 811 doctype.start = this.txtstart; 812 doctype.end = this.txtend; 813 814 return false; 815 } 816 817 public short apparentVersion() 818 { 819 switch (this.doctype) 820 { 821 case Dict.VERS_UNKNOWN: 822 return HTMLVersion(); 823 824 case Dict.VERS_HTML20: 825 if ((this.versions & Dict.VERS_HTML20) != 0) 826 return Dict.VERS_HTML20; 827 828 break; 829 830 case Dict.VERS_HTML32: 831 if ((this.versions & Dict.VERS_HTML32) != 0) 832 return Dict.VERS_HTML32; 833 834 break; 835 836 case Dict.VERS_HTML40_STRICT: 837 if ((this.versions & Dict.VERS_HTML40_STRICT) != 0) 838 return Dict.VERS_HTML40_STRICT; 839 840 break; 841 842 case Dict.VERS_HTML40_LOOSE: 843 if ((this.versions & Dict.VERS_HTML40_LOOSE) != 0) 844 return Dict.VERS_HTML40_LOOSE; 845 846 break; 847 848 case Dict.VERS_FRAMES: 849 if (( |