package org.htmlcleaner;

import java.io.*;
import java.net.URL;
import java.util.*;

/**
 * Reads HTML from a string, file, URL or stream, reorganizes the token list
 * produced by {@link HtmlTokenizer} into a tree rooted at an <code>html</code>
 * node (with <code>head</code> and <code>body</code> children) and exposes
 * that tree to the XML serializers.
 *
 * <p>Typical usage: construct an instance, optionally adjust the flags via the
 * setters, call {@link #clean()}, then call one of the <code>write...</code> or
 * <code>get...AsString</code> methods.</p>
 *
 * <p>NOTE(review): instances hold mutable parsing state in plain fields and
 * nothing here synchronizes access, so treat an instance as single-threaded
 * unless confirmed otherwise.</p>
 */
public class HtmlCleaner {

    /** Value of the <code>file.encoding</code> system property at class-load time. */
    public static final String DEFAULT_CHARSET = System.getProperty("file.encoding");

    private static final int WRITE_METHOD_SIMPLE = 0;
    private static final int WRITE_METHOD_COMPACT = 1;
    private static final int WRITE_METHOD_PRETTY = 2;

    /**
     * Records an open tag: its name, its index in the node list and the
     * {@link TagInfo} the provider returns for that name (may be null).
     * Must stay a non-static inner class because it reads the enclosing
     * cleaner's <code>tagInfoProvider</code>.
     */
    private class TagPos {
        private int position;
        private String name;
        private TagInfo info;

        TagPos(int position, String name) {
            this.position = position;
            this.name = name;
            this.info = tagInfoProvider.getTagInfo(name);
        }
    }

    /**
     * Bookkeeping for the tags that are currently open while the tree is
     * built: an ordered list of {@link TagPos} plus the set of every tag name
     * that was ever added (names are never removed from that set).
     */
    private class OpenTags {
        private List list = new ArrayList();
        private TagPos last = null;
        private Set set = new HashSet();

        private boolean isEmpty() {
            return list.isEmpty();
        }

        /** Appends a new open tag and remembers its name as "encountered". */
        private void addTag(String tagName, int position) {
            last = new TagPos(position, tagName);
            list.add(last);
            set.add(tagName);
        }

        /**
         * Removes the most recently opened tag with the given name (if any)
         * and refreshes <code>last</code>. The "encountered" set is left
         * untouched on purpose: {@link #tagEncountered(String)} answers
         * "was this ever opened", not "is this open now".
         */
        private void removeTag(String tagName) {
            ListIterator it = list.listIterator(list.size());
            while (it.hasPrevious()) {
                TagPos currTagPos = (TagPos) it.previous();
                if (tagName.equals(currTagPos.name)) {
                    it.remove();
                    break;
                }
            }

            last = list.isEmpty() ? null : (TagPos) list.get(list.size() - 1);
        }

        /** @return the earliest opened tag still in the list, or null if none */
        private TagPos findFirstTagPos() {
            return list.isEmpty() ? null : (TagPos) list.get(0);
        }

        /** @return the most recently opened tag still in the list, or null if none */
        private TagPos getLastTagPos() {
            return last;
        }

        /**
         * Searches from the most recently opened tag backwards.
         *
         * @param tagName name to look for; null yields null
         * @return the latest open tag with that name, or null if not open
         */
        private TagPos findTag(String tagName) {
            if (tagName != null) {
                ListIterator it = list.listIterator(list.size());
                while (it.hasPrevious()) {
                    TagPos currTagPos = (TagPos) it.previous();
                    if (tagName.equals(currTagPos.name)) {
                        return currTagPos;
                    }
                }
            }

            return null;
        }

        private boolean tagExists(String tagName) {
            return findTag(tagName) != null;
        }

        /**
         * Walks the open tags from newest to oldest. As soon as a tag with no
         * {@link TagInfo} or one whose info <code>allowsAnything()</code> is
         * met and a newer tag was already visited, that newer tag is returned.
         * If the walk finishes without such a hit, the oldest open tag is
         * returned (or null when nothing is open).
         */
        private TagPos findTagToPlaceRubbish() {
            TagPos result = null;
            TagPos prev = null;

            if (!isEmpty()) {
                ListIterator it = list.listIterator(list.size());
                while (it.hasPrevious()) {
                    result = (TagPos) it.previous();
                    if (result.info == null || result.info.allowsAnything()) {
                        if (prev != null) {
                            return prev;
                        }
                    }
                    prev = result;
                }
            }

            return result;
        }

        /** @return true if a tag with this name was ever added, open or not */
        private boolean tagEncountered(String tagName) {
            return set.contains(tagName);
        }

        /**
         * @param tags set of tag names
         * @return true if at least one currently open tag has a name in <code>tags</code>
         */
        private boolean someAlreadyOpen(Set tags) {
            Iterator it = list.iterator();
            while (it.hasNext()) {
                TagPos curr = (TagPos) it.next();
                if (tags.contains(curr.name)) {
                    return true;
                }
            }

            return false;
        }
    }

    private ITagInfoProvider tagInfoProvider;

    private Reader reader;
    private transient OpenTags _openTags = new OpenTags();
    private transient DoctypeToken _docType = null;
    private Set allTags = new TreeSet();

    private boolean advancedXmlEscape = true;
    private boolean useCdataForScriptAndStyle = true;
    private boolean translateSpecialEntities = true;
    private boolean recognizeUnicodeChars = true;
    private boolean omitUnknownTags = false;
    private boolean omitDeprecatedTags = false;
    private boolean omitComments = false;
    private boolean omitXmlDeclaration = false;
    private boolean omitDoctypeDeclaration = true;
    private boolean omitXmlnsAttributes = false;
    private String hyphenReplacementInComment = "=";

    private TagNode htmlNode;
    private TagNode bodyNode;
    private TagNode headNode;

    /**
     * Returns the given provider, or the shared {@link HtmlTagProvider}
     * instance when the argument is null.
     */
    private static ITagInfoProvider resolveProvider(ITagInfoProvider provider) {
        return provider == null ? HtmlTagProvider.getInstance() : provider;
    }

    /**
     * @param htmlContent HTML text to clean
     * @param tagInfoProvider tag definitions; null selects {@link HtmlTagProvider#getInstance()}
     */
    public HtmlCleaner(String htmlContent, ITagInfoProvider tagInfoProvider) {
        this.reader = new StringReader(htmlContent);
        this.tagInfoProvider = resolveProvider(tagInfoProvider);
    }

    /**
     * @param htmlContent HTML text to clean
     */
    public HtmlCleaner(String htmlContent) {
        this(htmlContent, HtmlTagProvider.getInstance());
    }

    /**
     * @param file file to read the HTML from
     * @param charset name of the charset used to decode the file
     * @param tagInfoProvider tag definitions; null selects {@link HtmlTagProvider#getInstance()}
     * @throws IOException if the file cannot be opened or the charset is not supported
     */
    public HtmlCleaner(File file, String charset, ITagInfoProvider tagInfoProvider) throws IOException {
        FileInputStream in = new FileInputStream(file);
        boolean readerCreated = false;
        try {
            this.reader = new InputStreamReader(in, charset);
            readerCreated = true;
        } finally {
            // Do not leak the file handle when the reader could not be built
            // (e.g. unsupported charset name).
            if (!readerCreated) {
                try {
                    in.close();
                } catch (IOException ignored) {
                    // keep the original failure as the reported exception
                }
            }
        }
        this.tagInfoProvider = resolveProvider(tagInfoProvider);
    }

    /**
     * @param file file to read the HTML from
     * @param charset name of the charset used to decode the file
     * @throws IOException if the file cannot be opened or the charset is not supported
     */
    public HtmlCleaner(File file, String charset) throws IOException {
        this(file, charset, HtmlTagProvider.getInstance());
    }

    /**
     * Reads the file using {@link #DEFAULT_CHARSET}.
     *
     * @param file file to read the HTML from
     * @param tagInfoProvider tag definitions; null selects {@link HtmlTagProvider#getInstance()}
     * @throws IOException if the file cannot be opened
     */
    public HtmlCleaner(File file, ITagInfoProvider tagInfoProvider) throws IOException {
        this(file, DEFAULT_CHARSET, tagInfoProvider);
    }

    /**
     * Reads the file using {@link #DEFAULT_CHARSET}.
     *
     * @param file file to read the HTML from
     * @throws IOException if the file cannot be opened
     */
    public HtmlCleaner(File file) throws IOException {
        this(file, DEFAULT_CHARSET, HtmlTagProvider.getInstance());
    }

    /**
     * Fetches the whole content of the URL up front via <code>Utils.readUrl</code>.
     *
     * @param url location of the HTML
     * @param charset charset name passed to <code>Utils.readUrl</code>
     * @param tagInfoProvider tag definitions; null selects {@link HtmlTagProvider#getInstance()}
     * @throws IOException if reading the URL fails
     */
    public HtmlCleaner(URL url, String charset, ITagInfoProvider tagInfoProvider) throws IOException {
        StringBuffer content = Utils.readUrl(url, charset);
        this.reader = new StringReader(content.toString());
        this.tagInfoProvider = resolveProvider(tagInfoProvider);
    }

    /**
     * Fetches the URL using {@link #DEFAULT_CHARSET}.
     *
     * @param url location of the HTML
     * @param tagInfoProvider tag definitions; null selects {@link HtmlTagProvider#getInstance()}
     * @throws IOException if reading the URL fails
     */
    public HtmlCleaner(URL url, ITagInfoProvider tagInfoProvider) throws IOException {
        this(url, DEFAULT_CHARSET, tagInfoProvider);
    }

    /**
     * @param url location of the HTML
     * @param charset charset name passed to <code>Utils.readUrl</code>
     * @throws IOException if reading the URL fails
     */
    public HtmlCleaner(URL url, String charset) throws IOException {
        this(url, charset, HtmlTagProvider.getInstance());
    }

    /**
     * Fetches the URL using {@link #DEFAULT_CHARSET}.
     *
     * @param url location of the HTML
     * @throws IOException if reading the URL fails
     */
    public HtmlCleaner(URL url) throws IOException {
        this(url, DEFAULT_CHARSET, HtmlTagProvider.getInstance());
    }

    /**
     * Decodes the stream with the platform default charset
     * (single-argument {@link InputStreamReader} constructor).
     *
     * @param in stream to read the HTML from
     * @param tagInfoProvider tag definitions; null selects {@link HtmlTagProvider#getInstance()}
     */
    public HtmlCleaner(InputStream in, ITagInfoProvider tagInfoProvider) {
        this.reader = new InputStreamReader(in);
        this.tagInfoProvider = resolveProvider(tagInfoProvider);
    }

    /**
     * Decodes the stream with the platform default charset.
     *
     * @param in stream to read the HTML from
     */
    public HtmlCleaner(InputStream in) {
        this(in, HtmlTagProvider.getInstance());
    }

    /**
     * @param in stream to read the HTML from
     * @param charset name of the charset used to decode the stream
     * @throws IOException if the charset is not supported
     */
    public HtmlCleaner(InputStream in, String charset) throws IOException {
        this.reader = new InputStreamReader(in, charset);
        // Previously left null, which made any later tag lookup throw
        // NullPointerException for instances built through this constructor.
        this.tagInfoProvider = HtmlTagProvider.getInstance();
    }

    DoctypeToken getDoctype() {
        return _docType;
    }

    void setDoctype(DoctypeToken type) {
        _docType = type;
    }

    /**
     * Runs the tokenizer over the input and builds the node tree: resets the
     * collected tag names, creates fresh <code>html</code>/<code>head</code>/
     * <code>body</code> nodes, closes every tag still open after tokenizing
     * and distributes the resulting top-level nodes between head and body.
     *
     * @throws IOException declared by the signature; presumably propagated
     *         from the tokenizer reading the input -- confirm in HtmlTokenizer
     */
    public void clean() throws IOException {
        allTags.clear();

        htmlNode = new TagNode("html");
        bodyNode = new TagNode("body");
        headNode = new TagNode("head");
        htmlNode.addChild(headNode);
        htmlNode.addChild(bodyNode);

        HtmlTokenizer htmlTokenizer = new HtmlTokenizer(this);

        htmlTokenizer.start();

        List nodeList = htmlTokenizer.getTokenList();
        closeAll(nodeList);
        createDocumentNodes(nodeList);
    }

    Reader getReader() {
        return reader;
    }

    /**
     * Copies attributes onto <code>tag</code>, skipping any attribute name the
     * tag already has. A null map is ignored.
     */
    private void addAttributesToTag(TagNode tag, Map attributes) {
        if (attributes != null) {
            Map tagAttributes = tag.getAttributes();
            Iterator it = attributes.entrySet().iterator();
            while (it.hasNext()) {
                Map.Entry currEntry = (Map.Entry) it.next();
                String attName = (String) currEntry.getKey();
                if (!tagAttributes.containsKey(attName)) {
                    String attValue = (String) currEntry.getValue();
                    tag.addAttribute(attName, attValue);
                }
            }
        }
    }

    /**
     * @return true when <code>tag</code> is null, declares no fatal tag, or
     *         its fatal tag is currently open
     */
    private boolean isFatalTagSatisfied(TagInfo tag) {
        if (tag != null) {
            String fatalTagName = tag.getFatalTag();
            return fatalTagName == null || _openTags.tagExists(fatalTagName);
        }

        return true;
    }

    /**
     * Decides whether a start token for the tag's required parent has to be
     * inserted. Returns false when the tag is null or has no required parent.
     * Otherwise the open tags are scanned from newest to oldest for the first
     * one for which <code>tag.isHigher(name)</code> holds: the parent must be
     * added if that tag sits at or before the open fatal tag's position (-1
     * when there is no open fatal tag), or if no such tag is open at all.
     */
    private boolean mustAddRequiredParent(TagInfo tag) {
        if (tag != null) {
            String requiredParent = tag.getRequiredParent();
            if (requiredParent != null) {
                String fatalTag = tag.getFatalTag();
                int fatalTagPosition = -1;
                if (fatalTag != null) {
                    TagPos tagPos = _openTags.findTag(fatalTag);
                    if (tagPos != null) {
                        fatalTagPosition = tagPos.position;
                    }
                }

                ListIterator it = _openTags.list.listIterator(_openTags.list.size());
                while (it.hasPrevious()) {
                    TagPos currTagPos = (TagPos) it.previous();
                    if (tag.isHigher(currTagPos.name)) {
                        return currTagPos.position <= fatalTagPosition;
                    }
                }

                return true;
            }
        }

        return false;
    }

    /** Marks the start token as formed and returns the same instance. */
    private TagNode createTagNode(TagNode startTagToken) {
        startTagToken.setFormed();
        return startTagToken;
    }

    /**
     * @return false only when the most recently opened tag has a
     *         {@link TagInfo} that rejects the token via <code>allowsItem</code>
     */
    private boolean isAllowedInLastOpenTag(BaseToken token) {
        TagPos last = _openTags.getLastTagPos();
        if (last != null && last.info != null) {
            return last.info.allowsItem(token);
        }

        return true;
    }

    /**
     * Queues a token that is not allowed at its current place on the start
     * token chosen by {@link OpenTags#findTagToPlaceRubbish()}, unless the
     * last open tag's info reports <code>isIgnorePermitted()</code>, in which
     * case the token is simply not queued.
     */
    private void saveToLastOpenTag(List nodeList, Object tokenToAdd) {
        TagPos last = _openTags.getLastTagPos();
        if (last != null && last.info != null && last.info.isIgnorePermitted()) {
            return;
        }

        TagPos rubbishPos = _openTags.findTagToPlaceRubbish();
        if (rubbishPos != null) {
            TagNode startTagToken = (TagNode) nodeList.get(rubbishPos.position);
            startTagToken.addItemForMoving(tokenToAdd);
        }
    }

    /** A start token is a {@link TagNode} that has not been marked formed yet. */
    private boolean isStartToken(Object o) {
        return (o instanceof TagNode) && !((TagNode) o).isFormed();
    }

    /**
     * Consumes tokens from <code>nodeIterator</code> and rewrites
     * <code>nodeList</code> in place: end tags close the matching open
     * snippet, start tags are dropped, re-parented, or registered as open,
     * and other tokens that the last open tag does not allow are moved away.
     * Slots of consumed tokens are set to null.
     *
     * <p>Package-private; presumably driven by {@link HtmlTokenizer} while it
     * produces tokens -- confirm against the tokenizer.</p>
     *
     * @param nodeList list being rewritten
     * @param nodeIterator iterator over <code>nodeList</code>, positioned at
     *        the first token to process
     */
    void makeTree(List nodeList, ListIterator nodeIterator) {
        while (nodeIterator.hasNext()) {
            BaseToken token = (BaseToken) nodeIterator.next();

            if (token instanceof EndTagToken) {
                EndTagToken endTagToken = (EndTagToken) token;
                String tagName = endTagToken.getName();
                TagInfo tag = tagInfoProvider.getTagInfo(tagName);

                if ((tag == null && omitUnknownTags) || (tag != null && tag.isDeprecated() && omitDeprecatedTags)) {
                    // unknown or deprecated tag that is configured to be dropped
                    nodeIterator.set(null);
                } else if (tag != null && !tag.allowsBody()) {
                    // end token of a body-less tag carries no information
                    nodeIterator.set(null);
                } else {
                    TagPos matchingPosition = _openTags.findTag(tagName);

                    if (matchingPosition != null) {
                        closeSnippet(nodeList, matchingPosition, endTagToken);
                    } else if (!isAllowedInLastOpenTag(token)) {
                        saveToLastOpenTag(nodeList, token);
                    }

                    nodeIterator.set(null);
                }
            } else if (isStartToken(token)) {
                TagNode startTagToken = (TagNode) token;
                String tagName = startTagToken.getName();
                TagInfo tag = tagInfoProvider.getTagInfo(tagName);

                // every start tag name is recorded, even if the tag is dropped below
                allTags.add(tagName);

                if ("html".equals(tagName)) {
                    // html/body/head are never nested: only their attributes are merged
                    addAttributesToTag(htmlNode, startTagToken.getAttributes());
                    nodeIterator.set(null);
                } else if ("body".equals(tagName)) {
                    addAttributesToTag(bodyNode, startTagToken.getAttributes());
                    nodeIterator.set(null);
                } else if ("head".equals(tagName)) {
                    addAttributesToTag(headNode, startTagToken.getAttributes());
                    nodeIterator.set(null);
                } else if ((tag == null && omitUnknownTags) || (tag != null && tag.isDeprecated() && omitDeprecatedTags)) {
                    // unknown or deprecated tag that is configured to be dropped
                    nodeIterator.set(null);
                } else if (tag != null && tag.hasPermittedTags() && _openTags.someAlreadyOpen(tag.getPermittedTags())) {
                    // dropped because one of the tag's "permitted" tags is currently open
                    nodeIterator.set(null);
                } else if (tag != null && tag.isUnique() && _openTags.tagEncountered(tagName)) {
                    // unique tag that was already seen once
                    nodeIterator.set(null);
                } else if (!isFatalTagSatisfied(tag)) {
                    // the tag's fatal tag is not open
                    nodeIterator.set(null);
                } else if (mustAddRequiredParent(tag)) {
                    // insert the required parent before the current token and
                    // rewind so the parent is processed next
                    String requiredParent = tag.getRequiredParent();
                    TagNode requiredParentStartToken = new TagNode(requiredParent);
                    nodeIterator.previous();
                    nodeIterator.add(requiredParentStartToken);
                    nodeIterator.previous();
                } else if (tag != null && !_openTags.isEmpty()
                        && tag.isMustCloseTag(tagInfoProvider.getTagInfo(_openTags.getLastTagPos().name))) {
                    // the last open tag has to be closed before this one starts
                    List closed = closeSnippet(nodeList, _openTags.getLastTagPos(), startTagToken);
                    int closedCount = closed.size();

                    if (tag.hasCopyTags() && closedCount > 0) {
                        // collect the trailing run of closed tags that this tag wants copied
                        ListIterator closedIt = closed.listIterator(closedCount);
                        List toBeCopied = new ArrayList();
                        while (closedIt.hasPrevious()) {
                            TagNode currStartToken = (TagNode) closedIt.previous();
                            if (tag.isCopy(currStartToken.getName())) {
                                toBeCopied.add(0, currStartToken);
                            } else {
                                break;
                            }
                        }

                        if (toBeCopied.size() > 0) {
                            Iterator copyIt = toBeCopied.iterator();
                            while (copyIt.hasNext()) {
                                TagNode currStartToken = (TagNode) copyIt.next();
                                nodeIterator.add(currStartToken.makeCopy());
                            }

                            // rewind the cursor over the copies just inserted
                            for (int i = 0; i < toBeCopied.size(); i++) {
                                nodeIterator.previous();
                            }
                        }
                    }

                    // step back so the current start token is processed again
                    nodeIterator.previous();
                } else if (!isAllowedInLastOpenTag(token)) {
                    saveToLastOpenTag(nodeList, token);
                    nodeIterator.set(null);
                } else if (tag != null && !tag.allowsBody()) {
                    // body-less tag: it is complete as soon as it is seen
                    TagNode newTagNode = createTagNode(startTagToken);
                    if (tag.isHeadTag()) {
                        headNode.addChild(newTagNode);
                        nodeIterator.set(null);
                    } else {
                        nodeIterator.set(newTagNode);
                    }
                } else {
                    // ordinary start tag: remember it as open at its list index
                    _openTags.addTag(tagName, nodeIterator.previousIndex());
                }
            } else {
                if (!isAllowedInLastOpenTag(token)) {
                    saveToLastOpenTag(nodeList, token);
                    nodeIterator.set(null);
                }
            }
        }
    }

    /**
     * Attaches every non-null entry of the list to <code>head</code> or
     * <code>body</code>. Tag nodes go to head when their info is a head tag,
     * or a head-and-body tag while body is still empty; everything else goes
     * to body. Content tokens whose text is empty are skipped.
     */
    private void createDocumentNodes(List listNodes) {
        Iterator it = listNodes.iterator();
        while (it.hasNext()) {
            Object child = it.next();

            if (child == null) {
                continue;
            }

            TagNode parent = bodyNode;
            boolean toAdd = true;

            if (child instanceof TagNode) {
                TagInfo tag = tagInfoProvider.getTagInfo(((TagNode) child).getName());
                if (tag != null) {
                    if (tag.isHeadTag() || (tag.isHeadAndBodyTag() && bodyNode.getChildren().isEmpty())) {
                        parent = headNode;
                    }
                }
            } else {
                if (child instanceof ContentToken) {
                    toAdd = !"".equals(((ContentToken) child).toString());
                }
            }

            if (toAdd) {
                parent.addChild(child);
            }
        }
    }

    /**
     * Closes everything from the open tag at <code>tagPos</code> up to (but
     * not including) <code>toNode</code>, or to the end of the list when
     * <code>toNode</code> is null. Each start token on the way is marked
     * formed, removed from the open tags and nested under the previously
     * closed one; other items become children of the most recently closed
     * tag. Items queued "for moving" on a start token are first turned into
     * their own sub-tree using a temporary, empty set of open tags.
     *
     * @param nodeList list being rewritten in place
     * @param tagPos open tag at which closing starts
     * @param toNode token (compared by identity) at which closing stops, or null
     * @return the start tokens that were closed, in list order
     */
    private List closeSnippet(List nodeList, TagPos tagPos, Object toNode) {
        List closed = new ArrayList();
        ListIterator it = nodeList.listIterator(tagPos.position);

        TagNode tagNode = null;
        Object item = it.next();
        boolean isListEnd = false;

        // Always stop at the end of the list: the previous condition ignored
        // isListEnd when toNode was non-null and would spin forever if toNode
        // was not found after tagPos.position.
        while (!isListEnd && (toNode == null || item != toNode)) {
            if (isStartToken(item)) {
                TagNode startTagToken = (TagNode) item;
                closed.add(startTagToken);
                List itemsToMove = startTagToken.getItemsToMove();
                if (itemsToMove != null) {
                    // build the moved items as an independent tree
                    OpenTags prevOpenTags = _openTags;
                    _openTags = new OpenTags();
                    makeTree(itemsToMove, itemsToMove.listIterator(0));
                    closeAll(itemsToMove);
                    startTagToken.setItemsToMove(null);
                    _openTags = prevOpenTags;
                }

                TagNode newTagNode = createTagNode(startTagToken);

                TagInfo tag = tagInfoProvider.getTagInfo(newTagNode.getName());
                if (tag != null && tag.isHeadTag()) {
                    headNode.addChild(newTagNode);
                    it.set(null);
                } else if (tagNode != null) {
                    // NOTE(review): itemsToMove may be null here; this relies on
                    // TagNode.addChildren accepting null -- confirm.
                    tagNode.addChildren(itemsToMove);
                    tagNode.addChild(newTagNode);
                    it.set(null);
                } else {
                    if (itemsToMove != null) {
                        itemsToMove.add(newTagNode);
                        it.set(itemsToMove);
                    } else {
                        it.set(newTagNode);
                    }
                }

                _openTags.removeTag(newTagNode.getName());
                tagNode = newTagNode;
            } else {
                if (tagNode != null) {
                    it.set(null);
                    if (item != null) {
                        tagNode.addChild(item);
                    }
                }
            }

            if (it.hasNext()) {
                item = it.next();
            } else {
                isListEnd = true;
            }
        }

        return closed;
    }

    /**
     * Closes every tag that is still open, starting from the earliest one.
     */
    private void closeAll(List nodeList) {
        TagPos firstTagPos = _openTags.findFirstTagPos();
        if (firstTagPos != null) {
            closeSnippet(nodeList, firstTagPos, null);
        }
    }

    /** @return whether tags unknown to the tag info provider are dropped (default false) */
    public boolean isOmitUnknownTags() {
        return omitUnknownTags;
    }

    /** @param omitUnknownTags true to drop tags for which the provider returns no info */
    public void setOmitUnknownTags(boolean omitUnknownTags) {
        this.omitUnknownTags = omitUnknownTags;
    }

    /** @return whether tags reported as deprecated by their info are dropped (default false) */
    public boolean isOmitDeprecatedTags() {
        return omitDeprecatedTags;
    }

    /** @param omitDeprecatedTags true to drop tags whose info reports <code>isDeprecated()</code> */
    public void setOmitDeprecatedTags(boolean omitDeprecatedTags) {
        this.omitDeprecatedTags = omitDeprecatedTags;
    }

    // The flags below are only stored and returned by this class; they are
    // presumably consulted by the tokenizer and serializers -- confirm there.

    /** @return the advancedXmlEscape flag (default true) */
    public boolean isAdvancedXmlEscape() {
        return advancedXmlEscape;
    }

    public void setAdvancedXmlEscape(boolean advancedXmlEscape) {
        this.advancedXmlEscape = advancedXmlEscape;
    }

    /** @return the useCdataForScriptAndStyle flag (default true) */
    public boolean isUseCdataForScriptAndStyle() {
        return useCdataForScriptAndStyle;
    }

    public void setUseCdataForScriptAndStyle(boolean useCdataForScriptAndStyle) {
        this.useCdataForScriptAndStyle = useCdataForScriptAndStyle;
    }

    /** @return the translateSpecialEntities flag (default true) */
    public boolean isTranslateSpecialEntities() {
        return translateSpecialEntities;
    }

    public void setTranslateSpecialEntities(boolean translateSpecialEntities) {
        this.translateSpecialEntities = translateSpecialEntities;
    }

    /** @return the recognizeUnicodeChars flag (default true) */
    public boolean isRecognizeUnicodeChars() {
        return recognizeUnicodeChars;
    }

    public void setRecognizeUnicodeChars(boolean recognizeUnicodeChars) {
        this.recognizeUnicodeChars = recognizeUnicodeChars;
    }

    /** @return the omitComments flag (default false) */
    public boolean isOmitComments() {
        return omitComments;
    }

    public void setOmitComments(boolean omitComments) {
        this.omitComments = omitComments;
    }

    /** @return the omitXmlDeclaration flag (default false) */
    public boolean isOmitXmlDeclaration() {
        return omitXmlDeclaration;
    }

    public void setOmitXmlDeclaration(boolean omitXmlDeclaration) {
        this.omitXmlDeclaration = omitXmlDeclaration;
    }

    /** @return the omitDoctypeDeclaration flag (default true) */
    public boolean isOmitDoctypeDeclaration() {
        return omitDoctypeDeclaration;
    }

    public void setOmitDoctypeDeclaration(boolean omitDoctypeDeclaration) {
        this.omitDoctypeDeclaration = omitDoctypeDeclaration;
    }

    /** @return the omitXmlnsAttributes flag (default false) */
    public boolean isOmitXmlnsAttributes() {
        return omitXmlnsAttributes;
    }

    public void setOmitXmlnsAttributes(boolean omitXmlnsAttributes) {
        this.omitXmlnsAttributes = omitXmlnsAttributes;
    }

    /** @return the hyphen replacement string for comments (default "=") */
    public String getHyphenReplacementInComment() {
        return hyphenReplacementInComment;
    }

    public void setHyphenReplacementInComment(String hyphenReplacementInComment) {
        this.hyphenReplacementInComment = hyphenReplacementInComment;
    }

    /**
     * @return the live, sorted set of start tag names seen since the last
     *         {@link #clean()} call (the internal set itself, not a copy)
     */
    public Set getAllTags() {
        return allTags;
    }

    /**
     * Serializes the cleaned tree with a caller-supplied serializer.
     *
     * @param xmlSerializer serializer to hand the root <code>html</code> node to
     * @throws IOException if the serializer fails to write
     */
    public void writeXml(XmlSerializer xmlSerializer) throws IOException {
        xmlSerializer.createXml(htmlNode);
    }

    /**
     * Serializes the cleaned tree to the writer using the serializer selected
     * by <code>method</code> (compact, pretty, or simple for any other value).
     */
    private void writeXml(Writer writer, int method) throws IOException {
        XmlSerializer xmlSerializer;

        if (WRITE_METHOD_COMPACT == method) {
            xmlSerializer = new CompactXmlSerializer(writer, this);
        } else if (WRITE_METHOD_PRETTY == method) {
            xmlSerializer = new PrettyXmlSerializer(writer, this);
        } else {
            xmlSerializer = new SimpleXmlSerializer(writer, this);
        }

        xmlSerializer.createXml(htmlNode);
    }

    // NOTE(review): the BufferedWriter created below is not flushed here;
    // this relies on XmlSerializer.createXml flushing the writer -- confirm.
    private void writeToStream(OutputStream out, String charset, int method) throws IOException {
        BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(out, charset));
        writeXml(writer, method);
    }

    private void writeToStream(OutputStream out, int method) throws IOException {
        BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(out));
        writeXml(writer, method);
    }

    /** Writes the tree with the simple serializer using the platform default charset. */
    public void writeXmlToStream(OutputStream out) throws IOException {
        writeToStream(out, WRITE_METHOD_SIMPLE);
    }

    /** Writes the tree with the simple serializer using the given charset. */
    public void writeXmlToStream(OutputStream out, String charset) throws IOException {
        writeToStream(out, charset, WRITE_METHOD_SIMPLE);
    }

    /** Writes the tree with the compact serializer using the platform default charset. */
    public void writeCompactXmlToStream(OutputStream out) throws IOException {
        writeToStream(out, WRITE_METHOD_COMPACT);
    }

    /** Writes the tree with the compact serializer using the given charset. */
    public void writeCompactXmlToStream(OutputStream out, String charset) throws IOException {
        writeToStream(out, charset, WRITE_METHOD_COMPACT);
    }

    /** Writes the tree with the pretty serializer using the platform default charset. */
    public void writePrettyXmlToStream(OutputStream out) throws IOException {
        writeToStream(out, WRITE_METHOD_PRETTY);
    }

    /** Writes the tree with the pretty serializer using the given charset. */
    public void writePrettyXmlToStream(OutputStream out, String charset) throws IOException {
        writeToStream(out, charset, WRITE_METHOD_PRETTY);
    }

    /**
     * Opens the file, serializes into it and always releases the file handle,
     * including when serialization throws.
     */
    private void writeToFile(String fileName, String charset, int method) throws IOException {
        OutputStream out = new FileOutputStream(fileName);
        try {
            writeToStream(out, charset, method);
        } finally {
            // closing an already closed FileOutputStream has no effect
            out.close();
        }
    }

    /**
     * Opens the file, serializes into it with the platform default charset
     * and always releases the file handle, including when serialization throws.
     */
    private void writeToFile(String fileName, int method) throws IOException {
        OutputStream out = new FileOutputStream(fileName);
        try {
            writeToStream(out, method);
        } finally {
            // closing an already closed FileOutputStream has no effect
            out.close();
        }
    }

    /** Writes the tree to the named file with the simple serializer. */
    public void writeXmlToFile(String fileName) throws IOException {
        writeToFile(fileName, WRITE_METHOD_SIMPLE);
    }

    /** Writes the tree to the named file with the simple serializer and the given charset. */
    public void writeXmlToFile(String fileName, String charset) throws IOException {
        writeToFile(fileName, charset, WRITE_METHOD_SIMPLE);
    }

    /** Writes the tree to the named file with the compact serializer. */
    public void writeCompactXmlToFile(String fileName) throws IOException {
        writeToFile(fileName, WRITE_METHOD_COMPACT);
    }

    /** Writes the tree to the named file with the compact serializer and the given charset. */
    public void writeCompactXmlToFile(String fileName, String charset) throws IOException {
        writeToFile(fileName, charset, WRITE_METHOD_COMPACT);
    }

    /** Writes the tree to the named file with the pretty serializer. */
    public void writePrettyXmlToFile(String fileName) throws IOException {
        writeToFile(fileName, WRITE_METHOD_PRETTY);
    }

    /** Writes the tree to the named file with the pretty serializer and the given charset. */
    public void writePrettyXmlToFile(String fileName, String charset) throws IOException {
        writeToFile(fileName, charset, WRITE_METHOD_PRETTY);
    }

    /** @return the tree serialized by the simple serializer */
    public String getXmlAsString() throws IOException {
        StringWriter writer = new StringWriter();
        writeXml(writer, WRITE_METHOD_SIMPLE);

        return writer.getBuffer().toString();
    }

    /** @return the tree serialized by the compact serializer */
    public String getCompactXmlAsString() throws IOException {
        StringWriter writer = new StringWriter();
        writeXml(writer, WRITE_METHOD_COMPACT);

        return writer.getBuffer().toString();
    }

    /** @return the tree serialized by the pretty serializer */
    public String getPrettyXmlAsString() throws IOException {
        StringWriter writer = new StringWriter();
        writeXml(writer, WRITE_METHOD_PRETTY);

        return writer.getBuffer().toString();
    }

}