1 package org.jahia.services.htmlparser; 2 3 import java.io.*; 4 import java.util.*; 5 6 import javax.xml.parsers.*; 7 import javax.xml.transform.*; 8 import javax.xml.transform.dom.*; 9 import javax.xml.transform.stream.*; 10 11 import org.jahia.registries.*; 12 import org.jahia.settings.*; 13 import org.jahia.utils.*; 14 import org.jahia.utils.fileparsers.*; 15 import org.jahia.utils.properties.*; 16 import org.w3c.dom.*; 17 import org.w3c.tidy.*; 18 import org.xml.sax.*; 19 20 29 public class TidyHtmlParser implements HtmlParser { 30 31 private static org.apache.log4j.Logger logger = 32 org.apache.log4j.Logger.getLogger(TidyHtmlParser.class); 33 34 public static String AMPERSAND = "$$$amp$$$"; 35 public static String AMPERSAND_SECONDPASS = "$$$amp_secondpass$$$"; 36 public static String TIDYERRORS_TAG = "TIDYERRORS"; 37 38 private static Vector newInlineTags = new Vector(); 39 private static Vector newBlockLevelTags = new Vector(); 40 private static Vector unrecognizedTags = new Vector(); 41 42 private Properties config = new Properties(); 43 44 public TidyHtmlParser(){} 45 46 public TidyHtmlParser(Properties config){ 47 this.config = config; 48 if ( this.config == null ){ 49 this.config = new Properties(); 50 } 51 } 52 53 57 public void init(HtmlParserService htmlParserService){ 58 59 SettingsBean settings = htmlParserService.getSettingsBean(); 60 if ( settings == null ){ 61 return; 62 } 63 String fileName = settings.getPropertiesFile().getProperty("tidyConfig"); 64 if ( fileName == null || "".equals(fileName.trim()) ){ 65 fileName = "tidy.properties"; 66 } 67 StringBuffer buff = new StringBuffer (settings.getJahiaEtcDiskPath()); 68 buff.append(File.separator); 69 buff.append("config"); 70 buff.append(File.separator); 71 buff.append(fileName); 72 try { 73 PropertiesManager propManager = new PropertiesManager(buff.toString()); 74 this.config = propManager.getPropertiesObject(); 75 } catch ( Throwable t ){ 76 logger.debug("Error loading tidy config file, use default settings ",t); 77 } 78 if ( this.config == null ){ 79 this.config = new Properties(); 80 } 81 } 82 83 91 public String parse(String inputString, Vector DOMVisitors){ 92 return parse(inputString,-1,config,DOMVisitors); 93 } 94 95 104 public String parse(String inputString, Vector DOMVisitors, 105 int siteId){ 106 if ( inputString == null || inputString.trim().equals("") ){ 107 return inputString; 108 } 109 return parse(inputString,siteId,config,DOMVisitors); 110 } 111 112 122 public static String parse( String input, 123 int siteId, 124 Properties tidyConfig, 125 Vector DOMVisitors){ 126 127 if (input == null || "".equals(input.trim())) { 128 return input; 129 } 130 String result = new String (input); 131 result = JahiaTools.replacePattern(result, "&", AMPERSAND); 132 133 ByteArrayInputStream strIn; 134 ByteArrayOutputStream strOut; 135 BufferedInputStream urlIn; 136 Tidy tidy = new Tidy(); 137 138 Properties config = (Properties) tidyConfig.clone(); 139 String val = tidyConfig.getProperty(TidyConfig.NEW_BLOCK_LEVEL_TAGS); 140 if (val == null) { 141 val = ""; 142 } 143 String tag = null; 144 int size = newBlockLevelTags.size(); 145 for (int i = 0; i < size; i++) { 146 tag = (String ) newBlockLevelTags.get(i); 147 if (val.length() == 0) { 148 val = tag; 149 } else { 150 val += "," + tag; 151 } 152 } 153 config.setProperty(TidyConfig.NEW_BLOCK_LEVEL_TAGS, val); 154 155 val = config.getProperty(TidyConfig.NEW_INLINE_TAGS); 156 if (val == null) { 157 val = ""; 158 } 159 size = newInlineTags.size(); 160 for (int i = 0; i < size; i++) { 161 tag = (String ) newInlineTags.get(i); 162 if (val.length() == 0) { 163 val = tag; 164 } else { 165 val += "," + tag; 166 } 167 } 168 169 config.setProperty(TidyConfig.NEW_INLINE_TAGS, val); 170 171 byte[] strByte = null; 173 String charSet = null; CharsetDetection charsetDet = new CharsetDetection(); 175 try { 176 strByte = org.apache.commons.io.IOUtils.toByteArray(result); 177 strIn = new ByteArrayInputStream(strByte); 178 charsetDet.charsetDetection(strIn); 179 charSet = charsetDet.getCharset(); 180 181 if ((config.getProperty(TidyConfig.CHAR_ENCODING) == null) 182 && "UTF-8".equalsIgnoreCase(charSet)) { 183 config.setProperty(TidyConfig.CHAR_ENCODING, "utf8"); 184 } 185 } catch (Throwable t) { 186 } 187 188 tidy.setConfigurationFromProps(config); 189 190 try { 191 if (charSet == null) { 192 strByte = result.getBytes(); 193 } else { 194 strByte = result.getBytes(charSet); 195 } 196 strIn = new ByteArrayInputStream(strByte); 197 strOut = new ByteArrayOutputStream(); 198 ByteArrayOutputStream strErr = new ByteArrayOutputStream(); 199 tidy.setErrout(new PrintWriter(strErr, true)); 200 tidy.setShowWarnings(false); 201 tidy.parse(strIn, strOut); 202 strIn.reset(); 203 String tmpValue = null; 204 if (charSet == null) { 205 tmpValue = strOut.toString(); 206 } else { 207 tmpValue = strOut.toString(charSet); 208 } 209 tmpValue = JahiaTools.replacePattern(tmpValue, "&", 210 AMPERSAND_SECONDPASS); 211 212 if (tmpValue == null) { 213 tmpValue = ""; 214 } 215 if (!"".equals(tmpValue.trim())) { 216 if (charSet == null) { 217 strByte = tmpValue.getBytes(); 218 } else { 219 strByte = tmpValue.getBytes(charSet); 220 } 221 strIn = new ByteArrayInputStream(strByte); 222 231 DocumentBuilderFactory dfactory = DocumentBuilderFactory. 232 newInstance(); 233 234 EntityResolver et = null; 235 try { 236 et = ServicesRegistry.getInstance(). 237 getJahiaWebAppsDeployerService().getDtdEntityResolver(); 238 } catch (Throwable t) { 239 } 240 DocumentBuilder docBuilder = dfactory.newDocumentBuilder(); 241 if (et != null) { 242 docBuilder.setEntityResolver(et); 243 } 244 Document doc = docBuilder.parse(strIn); 245 246 TagRemover tagRemover = 247 new TagRemover(); 248 249 synchronized (unrecognizedTags) { 250 size = unrecognizedTags.size(); 251 for (int i = 0; i < size; i++) { 252 tagRemover.addTag((String ) unrecognizedTags.get(i)); 253 } 254 } 255 doc = tagRemover.parseDOM(doc); 257 258 size = DOMVisitors.size(); 259 for (int i = 0; i < size; i++) { 260 HtmlDOMVisitor visitor = (HtmlDOMVisitor) DOMVisitors.get(i); 261 doc = visitor.parseDOM(doc); 262 } 263 264 doc.normalize(); 265 266 TransformerFactory tfactory = TransformerFactory.newInstance(); 267 268 Transformer serializer = tfactory.newTransformer(); 271 272 serializer.setOutputProperty(OutputKeys.METHOD, "html"); 273 serializer.setOutputProperty(OutputKeys.INDENT, "yes"); 274 if (charSet != null) { 275 serializer.setOutputProperty(OutputKeys.ENCODING, charSet); 276 } 277 strOut.reset(); 280 serializer.transform(new DOMSource(doc), 281 new StreamResult(strOut)); 282 283 if (tidy.getParseErrors() > 0) { 284 result = "<TIDYERRORS>\n" + strErr.toString() + 285 "</TIDYERRORS>"; 286 } else { 287 if (charSet == null) { 288 result = strOut.toString(); 289 } else { 290 result = strOut.toString(charSet); 291 } 292 } 293 result = JahiaTools.replacePattern(result, AMPERSAND_SECONDPASS, 294 "&"); 295 result = JahiaTools.text2XMLEntityRef(result, 1); 296 result = JahiaTools.replacePattern(result, AMPERSAND, "&"); 297 298 } else if (tidy.getParseErrors() > 0) { 299 String err = strErr.toString(); 300 result = "<TIDYERRORS>\n" + err + "</TIDYERRORS>"; 301 if (err.indexOf("is not recognized!") != -1) { 302 err = JahiaTools.replacePatternIgnoreCase(err.toLowerCase(), 303 " - error: ", "@@@"); 304 String [] errors = org.jahia.utils.JahiaTools.getTokens( 305 err, "@@@"); 306 if (errors.length > 0) { 307 String token = ""; 308 ArrayList tags = new ArrayList(); 309 tag = null; 310 String newInput = input; 311 int pos = -1; 312 for (int i = 0; i < errors.length; i++) { 313 token = errors[i]; 314 pos = token.indexOf(" is not recognized!"); 315 if (pos != -1) { 316 try { 317 tag = token.substring(0, pos); 318 if (!tag.startsWith("<")) { 319 synchronized (unrecognizedTags) { 321 if (unrecognizedTags.contains(tag)) { 322 continue; 323 } else { 324 unrecognizedTags.add(tag); 325 newInlineTags.add(tag); 326 } 327 } 328 } else { 329 tag = tag.substring(1, tag.length() - 1); 330 synchronized (unrecognizedTags) { 331 if (unrecognizedTags.contains(tag)) { 332 continue; 333 } else { 334 unrecognizedTags.add(tag); 335 newBlockLevelTags.add(tag); 336 } 337 } 338 } 339 } catch (Throwable t) { 340 } 341 } 342 } 343 result = parse(input, siteId, tidyConfig, 344 DOMVisitors); 345 } 346 } 347 } 348 } catch (Exception e) { 349 e.printStackTrace(); 350 return input; 351 } 352 return result; 353 } 354 355 } 356 | Popular Tags |