1 package org.jahia.services.htmlparser; 2 3 import java.io.*; 4 import java.util.*; 5 6 import javax.xml.transform.*; 7 import javax.xml.transform.dom.*; 8 import javax.xml.transform.stream.*; 9 10 import org.apache.xalan.templates.*; 11 import org.cyberneko.html.parsers.*; 12 import org.jahia.utils.fileparsers.*; 13 import org.w3c.dom.*; 14 import org.jahia.utils.JahiaTools; 15 16 25 public class NekoHtmlParser implements HtmlParser { 26 27 public static String AMPERSAND = "$$$amp$$$"; 28 29 private static org.apache.log4j.Logger logger = 30 org.apache.log4j.Logger.getLogger(NekoHtmlParser.class); 31 32 public NekoHtmlParser(){} 33 34 38 public void init(HtmlParserService htmlParserService){ 39 40 } 41 42 50 public String parse(String inputString, Vector DOMVisitors){ 51 return parse(inputString,-1,DOMVisitors); 52 } 53 54 63 public String parse(String inputString, Vector DOMVisitors, 64 int siteId){ 65 if ( inputString == null || inputString.trim().equals("") ){ 66 return inputString; 67 } 68 return parse(inputString,siteId,DOMVisitors); 69 } 70 71 80 public static String parse( String input, 81 int siteId, 82 Vector DOMVisitors){ 83 84 if ( input == null || "".equals(input.trim())){ 85 return input; 86 } 87 88 String result = new String (input); 89 result = JahiaTools.replacePattern(result, "&", AMPERSAND); 90 91 ByteArrayInputStream strIn; 92 ByteArrayOutputStream strOut = new ByteArrayOutputStream(); 93 byte[] strByte = null; 94 String charSet = null; CharsetDetection charsetDet = new CharsetDetection(); 96 try { 97 strByte = org.apache.commons.io.IOUtils.toByteArray(result); 98 strIn = new ByteArrayInputStream(strByte); 99 charsetDet.charsetDetection(strIn); 100 charSet = charsetDet.getCharset(); 101 } catch ( Throwable t ){ 102 } 103 104 DOMParser domParser = new DOMParser(); 105 Document doc; 106 int size = 0; 107 try { 108 if ( charSet == null ){ 109 strByte = result.getBytes(); 110 } else { 111 strByte = result.getBytes(charSet); 112 } 113 strIn = new ByteArrayInputStream(strByte); 114 org.xml.sax.InputSource in = new org.xml.sax.InputSource (strIn); 115 domParser.setProperty("http://cyberneko.org/html/properties/default-encoding", charSet); 116 domParser.setProperty("http://cyberneko.org/html/properties/names/elems", "lower"); 117 domParser.setProperty("http://cyberneko.org/html/properties/names/attrs", "lower"); 118 domParser.parse(in); 119 doc = domParser.getDocument(); 120 121 size = DOMVisitors.size(); 122 for (int i = 0; i < size; i++) { 123 HtmlDOMVisitor visitor = (HtmlDOMVisitor) DOMVisitors.get(i); 124 doc = visitor.parseDOM(doc); 125 } 126 127 doc.normalize(); 128 TransformerFactory tfactory = TransformerFactory.newInstance(); 129 130 Transformer serializer = tfactory.newTransformer(); 133 134 serializer.setOutputProperty(OutputKeys.METHOD, "html"); 135 serializer.setOutputProperty(OutputKeys.INDENT, "yes"); 136 if ( charSet != null ){ 137 serializer.setOutputProperty(OutputKeys.ENCODING, charSet); 138 } 139 serializer.setOutputProperty(OutputProperties.S_KEY_INDENT_AMOUNT, "2"); 141 serializer.transform (new DOMSource(doc), 142 new StreamResult(strOut)); 143 if ( charSet == null ){ 144 result = strOut.toString(); 145 } else { 146 result = strOut.toString(charSet); 147 } 148 149 result = JahiaTools.text2XMLEntityRef(result, 1); 150 result = JahiaTools.replacePattern(result, AMPERSAND, "&"); 151 152 } catch ( Throwable t ){ 153 logger.debug(t); 154 return input; 155 } 156 return result; 157 } 158 } 159 | Popular Tags |