1 package org.jahia.clipbuilder.html.web.html.Impl; 2 3 import java.io.*; 4 import org.jahia.clipbuilder.html.web.html.*; 5 import org.apache.xerces.parsers.*; 6 import org.cyberneko.html.*; 7 import org.w3c.dom.*; 8 import org.xml.sax.*; 9 10 15 public class DefaultHTMLParser implements HTMLParser { 16 private HTMLConfiguration configuration; 17 private static org.apache.log4j.Logger logger = org.apache.log4j.Logger.getLogger(DefaultHTMLParser.class); 18 19 20 23 public DefaultHTMLParser() { 24 init(); 25 } 26 27 28 29 34 public void setConfiguration(HTMLConfiguration configuration) { 35 logger.debug("[ Set configutation ]"); 36 37 38 this.configuration = configuration; 39 } 40 41 42 47 public HTMLConfiguration getConfiguration() { 48 49 return configuration; 50 } 51 52 53 61 public Document parse(String html) throws IOException, SAXException { 62 63 Document htmlDocument = null; 64 try { 65 java.io.InputStream in = new java.io.StringBufferInputStream (html); 67 68 DOMParser parser = new DOMParser(configuration); 69 parser.parse(new InputSource(in)); 70 htmlDocument = parser.getDocument(); 71 logger.debug("[ Parsing finished. ]"); 72 } 73 catch (IOException ex) { 74 ex.printStackTrace(); 75 } 76 catch (SAXException ex) { 77 ex.printStackTrace(); 78 } 79 80 return htmlDocument; 81 } 82 83 84 85 88 private void init() { 89 configuration = new org.cyberneko.html.HTMLConfiguration(); 91 configuration.setFeature("http://cyberneko.org/html/features/augmentations", true); 92 configuration.setProperty("http://cyberneko.org/html/properties/names/elems", "lower"); 93 configuration.setFeature("http://cyberneko.org/html/features/report-errors", false); 94 configuration.setFeature("http://cyberneko.org/html/features/balance-tags/ignore-outside-content", false); 95 configuration.setFeature("http://apache.org/xml/features/scanner/notify-char-refs", false); 96 configuration.setFeature("http://apache.org/xml/features/scanner/notify-builtin-refs", true); 97 configuration.setFeature("http://cyberneko.org/html/features/scanner/ignore-specified-charset",true); 98 99 configuration.setFeature("http://cyberneko.org/html/features/scanner/script/strip-comment-delims", true); 101 configuration.setFeature("http://cyberneko.org/html/features/scanner/style/strip-comment-delims", true); 102 configuration.setFeature("http://xml.org/sax/features/namespaces", false); 104 configuration.setFeature("http://cyberneko.org/html/features/insert-namespaces", false); 105 configuration.setFeature("http://cyberneko.org/html/features/override-namespaces", false); 106 107 108 } 109 110 } 111 | Popular Tags |