package org.opencms.util;

import org.opencms.file.CmsObject;
import org.opencms.file.CmsProperty;
import org.opencms.file.CmsPropertyDefinition;
import org.opencms.file.CmsResource;
import org.opencms.i18n.CmsEncoder;
import org.opencms.main.CmsException;
import org.opencms.main.CmsLog;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;
import java.util.StringTokenizer;
import java.util.regex.Pattern;

import org.apache.commons.logging.Log;

import org.w3c.tidy.Tidy;

/**
 * Converts HTML by running it through a {@link Tidy} parser and a set of
 * regular expression replacements.<p>
 *
 * The behavior is selected by a mode String consisting of tokens separated by
 * <code>;</code>. The tokens recognized are {@link #PARAM_ENABLED},
 * {@link #PARAM_XHTML} and {@link #PARAM_WORD}; each of them switches the
 * conversion on. If none of them is present, the <code>convertTo*</code>
 * methods hand the input back without parsing it.<p>
 */
public class CmsHtmlConverter {

    /** Mode value that marks the conversion as disabled, see {@link #isConversionEnabled(String)}. */
    public static final String PARAM_DISABLED = CmsStringUtil.FALSE;

    /** Mode token that switches the conversion on without any additional option. */
    public static final String PARAM_ENABLED = CmsStringUtil.TRUE;

    /** Mode token that switches on the cleanup mode (Word 2000 handling in Tidy plus the cleanup patterns). */
    public static final String PARAM_WORD = "cleanup";

    /** Mode token that switches on XHTML output in Tidy. */
    public static final String PARAM_XHTML = "xhtml";

    /** The log object for this class. */
    private static final Log LOG = CmsLog.getLog(CmsHtmlConverter.class);

    /** Regular expressions whose matches are removed from the HTML when the cleanup mode is active. */
    String[] m_cleanupPatterns = {
        "<o:p>.*(\\r\\n)*.*</o:p>",
        "<o:p>.*(\\r\\n)*.*</O:p>",
        "<\\?xml:.*(\\r\\n).*/>",
        "<\\?xml:.*(\\r\\n).*(\\r\\n).*/\\?>",
        "<\\?xml:.*(\\r\\n).*(\\r\\n).*/>",
        "<\\?xml:(.*(\\r\\n)).*/\\?>",
        "<o:SmartTagType.*(\\r\\n)*.*/>",
        "<o:smarttagtype.*(\\r\\n)*.*/>"};

    /** Compiled form of {@link #m_cleanupPatterns}, only created when the cleanup mode is active. */
    Pattern[] m_clearStyle;

    /** The encoding used for all conversions between bytes and Strings. */
    String m_encoding;

    /**
     * Regular expressions whose matches are replaced by the entry with the same
     * index in {@link #m_replaceValues}.<p>
     *
     * NOTE(review): as written here, the first and third pattern appear to be
     * replaced by identical text, which would make them no-ops. They may
     * originally have been character / entity references - confirm against the
     * upstream source before relying on them.<p>
     */
    String[] m_replacePatterns = {
        " ",
        "(\\r\\n){2,}",
        "–",
        "(\\n){2,}",
        "\\(\\r\\n<",
        "\\(\\n<",
        "\\(\\r\\n(\\ ){1,}<",
        "\\(\\n(\\ ){1,}<",
        "\\r\\n<span",
        "\\n<span"};

    /** Compiled form of {@link #m_replacePatterns}, only created when the conversion is enabled. */
    Pattern[] m_replaceStyle;

    /** Replacement texts, index-aligned with {@link #m_replacePatterns}. */
    String[] m_replaceValues = {" ", "", "–", "", "(<", "(<", "(<", "(<", "<span", "<span"};

    /** The Tidy instance used for parsing, only created when the conversion is enabled. */
    Tidy m_tidy;

    /** Length of the value of the <code>line.separator</code> system property. */
    private int m_lineSeparatorLength;

    /** Indicates if the conversion is enabled at all. */
    private boolean m_modeEnabled;

    /** Indicates if the cleanup mode ({@link #PARAM_WORD}) is active. */
    private boolean m_modeWord;

    /** Indicates if the XHTML mode ({@link #PARAM_XHTML}) is active. */
    private boolean m_modeXhtml;

    /**
     * Creates a converter that uses UTF-8 encoding and the mode {@link #PARAM_ENABLED}.<p>
     */
    public CmsHtmlConverter() {

        init(CmsEncoder.ENCODING_UTF_8, PARAM_ENABLED);
    }

    /**
     * Creates a converter with the given encoding and mode.<p>
     *
     * @param encoding the encoding used for all byte / String conversions
     * @param mode the conversion mode, a list of mode tokens separated by <code>;</code>
     */
    public CmsHtmlConverter(String encoding, String mode) {

        init(encoding, mode);
    }

    /**
     * Reads the content conversion setting for the given resource.<p>
     *
     * The value is taken from the property
     * {@link CmsPropertyDefinition#PROPERTY_CONTENT_CONVERSION}, which is read with
     * <code>true</code> as third argument (presumably a search on the parent
     * folders - confirm against <code>CmsObject#readPropertyObject</code>).<p>
     *
     * @param cms the CmsObject used to read the property
     * @param resource the resource to read the property for
     *
     * @return the property value, or {@link #PARAM_DISABLED} if reading the property failed
     */
    public static String getConversionSettings(CmsObject cms, CmsResource resource) {

        String contentConversion;
        try {
            String resourceName = cms.getSitePath(resource);
            CmsProperty contentConversionProperty = cms.readPropertyObject(
                resourceName,
                CmsPropertyDefinition.PROPERTY_CONTENT_CONVERSION,
                true);
            contentConversion = contentConversionProperty.getValue();
        } catch (CmsException e) {
            // the property could not be read: treat the conversion as disabled
            contentConversion = CmsHtmlConverter.PARAM_DISABLED;
        }
        return contentConversion;
    }

    /**
     * Tests if the given conversion mode switches the conversion on.<p>
     *
     * @param conversionMode the conversion mode to check
     *
     * @return <code>false</code> if the mode is <code>null</code> or contains
     *      {@link #PARAM_DISABLED}, <code>true</code> otherwise
     */
    public static boolean isConversionEnabled(String conversionMode) {

        boolean value = true;
        if ((conversionMode == null) || (conversionMode.indexOf(PARAM_DISABLED) != -1)) {
            value = false;
        }
        return value;
    }

    /**
     * Converts the given HTML bytes and returns the result as bytes.<p>
     *
     * If the conversion is not enabled, the input array itself is returned.<p>
     *
     * @param htmlInput the HTML to convert, encoded in the encoding of this converter
     *
     * @return the converted HTML, encoded in the encoding of this converter
     *
     * @throws UnsupportedEncodingException if the encoding of this converter is not supported
     */
    public byte[] convertToByte(byte[] htmlInput) throws UnsupportedEncodingException {

        if (m_modeEnabled) {
            return convertToByte(new String(htmlInput, m_encoding));
        }
        return htmlInput;
    }

    /**
     * Converts the given HTML String and returns the result as bytes.<p>
     *
     * @param htmlInput the HTML to convert
     *
     * @return the converted HTML, encoded in the encoding of this converter
     *
     * @throws UnsupportedEncodingException if the encoding of this converter is not supported
     */
    public byte[] convertToByte(String htmlInput) throws UnsupportedEncodingException {

        return convertToString(htmlInput).getBytes(m_encoding);
    }

    /**
     * Converts the given HTML bytes and returns the result as bytes,
     * without throwing an Exception if the conversion fails.<p>
     *
     * @param htmlInput the HTML to convert, encoded in the encoding of this converter
     *
     * @return the converted HTML, or the unchanged input if the conversion failed
     */
    public byte[] convertToByteSilent(byte[] htmlInput) {

        try {
            return convertToByte(htmlInput);
        } catch (Exception e) {
            if (LOG.isWarnEnabled()) {
                LOG.warn(Messages.get().getBundle().key(Messages.LOG_CONVERSION_BYTE_FAILED_0), e);
            }
            return htmlInput;
        }
    }

    /**
     * Converts the given HTML String and returns the result as bytes,
     * without throwing an Exception if the conversion fails.<p>
     *
     * @param htmlInput the HTML to convert
     *
     * @return the converted HTML, or the bytes of the unchanged input if the conversion failed
     */
    public byte[] convertToByteSilent(String htmlInput) {

        try {
            return convertToByte(htmlInput.getBytes(m_encoding));
        } catch (Exception e) {
            if (LOG.isWarnEnabled()) {
                LOG.warn(Messages.get().getBundle().key(Messages.LOG_CONVERSION_BYTE_FAILED_0), e);
            }
            try {
                return htmlInput.getBytes(m_encoding);
            } catch (UnsupportedEncodingException e1) {
                if (LOG.isWarnEnabled()) {
                    LOG.warn(Messages.get().getBundle().key(Messages.LOG_CONVERSION_BYTE_FAILED_0), e1);
                }
                // last resort: use the platform default encoding
                return htmlInput.getBytes();
            }
        }
    }

    /**
     * Converts the given HTML bytes and returns the result as a String.<p>
     *
     * @param htmlInput the HTML to convert, encoded in the encoding of this converter
     *
     * @return the converted HTML
     *
     * @throws UnsupportedEncodingException if the encoding of this converter is not supported
     */
    public String convertToString(byte[] htmlInput) throws UnsupportedEncodingException {

        return convertToString(new String(htmlInput, m_encoding));
    }

    /**
     * Converts the given HTML String and returns the result as a String.<p>
     *
     * If the conversion is not enabled, the input is returned unchanged.
     * Otherwise the input is passed through the regular expressions and the
     * Tidy parser. In cleanup mode up to 10 such passes are made, in all other
     * modes exactly one. The passes stop early as soon as the length of the
     * parsed result equals the length seen before that pass.<p>
     *
     * @param htmlInput the HTML to convert
     *
     * @return the converted HTML
     *
     * @throws UnsupportedEncodingException if the encoding of this converter is not supported
     */
    public String convertToString(String htmlInput) throws UnsupportedEncodingException {

        if (m_modeEnabled) {

            // cleanup mode may need several passes, all other modes use a single one
            int max = m_modeWord ? 10 : 1;
            int count = 0;

            // length before the first pass, used to detect that nothing changes anymore
            int oldSize = htmlInput.length();
            String workHtml = regExp(htmlInput);
            while (count < max) {
                count++;

                if (m_modeWord) {
                    // make sure there is a surrounding html / body element
                    workHtml = adjustHtml(workHtml);
                }
                workHtml = parse(workHtml, m_encoding);
                if (m_modeWord) {
                    workHtml = cutTrailingLineSeparator(workHtml);
                }

                if (workHtml.length() == oldSize) {
                    // the size did not change in this pass, so this is the final result
                    workHtml = regExp(workHtml);
                    break;
                }
                oldSize = workHtml.length();
                workHtml = regExp(workHtml);
            }
            if (LOG.isInfoEnabled()) {
                LOG.info(Messages.get().getBundle().key(
                    Messages.LOG_PARSING_RUNS_2,
                    this.getClass().getName(),
                    new Integer(count)));
            }
            htmlInput = workHtml;
        }

        return htmlInput;
    }

    /**
     * Converts the given HTML bytes and returns the result as a String,
     * without throwing an Exception if the conversion fails.<p>
     *
     * @param htmlInput the HTML to convert, encoded in the encoding of this converter
     *
     * @return the converted HTML, or the unchanged input decoded to a String if the conversion failed
     */
    public String convertToStringSilent(byte[] htmlInput) {

        try {
            return convertToString(htmlInput);
        } catch (Exception e) {
            if (LOG.isWarnEnabled()) {
                LOG.warn(Messages.get().getBundle().key(Messages.LOG_CONVERSION_BYTE_FAILED_0), e);
            }
            try {
                return new String(htmlInput, m_encoding);
            } catch (UnsupportedEncodingException e1) {
                if (LOG.isWarnEnabled()) {
                    LOG.warn(Messages.get().getBundle().key(Messages.LOG_CONVERSION_BYTE_FAILED_0), e1);
                }
                // last resort: use the platform default encoding
                return new String(htmlInput);
            }
        }
    }

    /**
     * Converts the given HTML String and returns the result as a String,
     * without throwing an Exception if the conversion fails.<p>
     *
     * @param htmlInput the HTML to convert
     *
     * @return the converted HTML, or the unchanged input if the conversion failed
     */
    public String convertToStringSilent(String htmlInput) {

        try {
            return convertToString(htmlInput);
        } catch (Exception e) {
            if (LOG.isWarnEnabled()) {
                LOG.warn(Messages.get().getBundle().key(Messages.LOG_CONVERSION_BYTE_FAILED_0), e);
            }
            return htmlInput;
        }
    }

    /**
     * Returns the encoding used by this converter.<p>
     *
     * @return the encoding used by this converter
     */
    public String getEncoding() {

        return m_encoding;
    }

    /**
     * Wraps the given HTML in a <code>html</code> and <code>body</code> element
     * if it contains neither <code>&lt;html&gt;</code> nor <code>&lt;/html&gt;</code>
     * (compared ignoring case).<p>
     *
     * @param htmlInput the HTML to adjust
     *
     * @return the HTML, wrapped if required
     */
    private String adjustHtml(String htmlInput) {

        // lower case the input only once for both checks
        String lowerCaseHtml = htmlInput.toLowerCase();
        if ((lowerCaseHtml.indexOf("<html>") == -1) && (lowerCaseHtml.indexOf("</html>") == -1)) {
            StringBuffer tmp = new StringBuffer();
            // the wrapper declares the "o" namespace prefix used by the Office tags
            tmp.append("<html xmlns:o=\"\"><body>");
            tmp.append(htmlInput);
            tmp.append("</body></html>");
            htmlInput = tmp.toString();
        }
        return htmlInput;
    }

    /**
     * Removes as many characters from the end of the given String as the
     * platform line separator is long.<p>
     *
     * The caller relies on the parser output ending with the line separator
     * (assumption taken over from the previous unconditional cut - confirm
     * against the Tidy output). A String that is shorter than the line
     * separator, for example an empty parser result, is returned unchanged
     * instead of causing a <code>StringIndexOutOfBoundsException</code>.<p>
     *
     * @param html the parser output
     *
     * @return the parser output without its last line separator sized part
     */
    private String cutTrailingLineSeparator(String html) {

        int end = html.length() - m_lineSeparatorLength;
        if (end < 0) {
            return html;
        }
        return html.substring(0, end);
    }

    /**
     * Splits the given mode String at <code>;</code> into its tokens.<p>
     *
     * @param mode the mode String, may be <code>null</code>
     *
     * @return the list of tokens (Strings), empty if the mode is <code>null</code>
     */
    private List extractModes(String mode) {

        ArrayList extractedModes = new ArrayList();
        if (mode != null) {
            StringTokenizer extract = new StringTokenizer(mode, ";");
            while (extract.hasMoreTokens()) {
                String tok = extract.nextToken();
                extractedModes.add(tok);
            }
        }
        return extractedModes;
    }

    /**
     * Initializes the mode flags, the encoding, the Tidy parser and the compiled patterns.<p>
     *
     * Tidy and the patterns are only set up if at least one of the mode tokens
     * {@link #PARAM_ENABLED}, {@link #PARAM_XHTML} or {@link #PARAM_WORD} is present.<p>
     *
     * @param encoding the encoding used for all byte / String conversions
     * @param mode the conversion mode, a list of mode tokens separated by <code>;</code>
     */
    private void init(String encoding, String mode) {

        List modes = extractModes(mode);

        // each of the known tokens switches the conversion on
        if (modes.contains(PARAM_ENABLED)) {
            m_modeEnabled = true;
        }
        if (modes.contains(PARAM_XHTML)) {
            m_modeEnabled = true;
            m_modeXhtml = true;
        }
        if (modes.contains(PARAM_WORD)) {
            m_modeEnabled = true;
            m_modeWord = true;
        }

        m_encoding = encoding;

        m_lineSeparatorLength = System.getProperty("line.separator").length();

        if (m_modeEnabled) {

            m_tidy = new Tidy();

            // select the parser behavior according to the mode flags
            m_tidy.setXHTML(m_modeXhtml);
            m_tidy.setWord2000(m_modeWord);

            // register the Office smart tag element as additional empty and inline tag
            Properties additionalTags = new Properties();
            additionalTags.put("new-empty-tags", "o:smarttagtype");
            additionalTags.put("new-inline-tags", "o:smarttagtype");
            m_tidy.getConfiguration().addProps(additionalTags);

            // input and output use the same encoding as this converter
            m_tidy.setInputEncoding(encoding);
            m_tidy.setOutputEncoding(encoding);

            // remaining parser and output options
            m_tidy.setTidyMark(false);
            m_tidy.setMakeClean(false);
            m_tidy.setNumEntities(true);
            m_tidy.setPrintBodyOnly(true);
            m_tidy.setForceOutput(true);
            m_tidy.setQuiet(true);
            m_tidy.setShowWarnings(false);
            m_tidy.setHideComments(false);
            m_tidy.setBreakBeforeBR(false);
            m_tidy.setWrapAttVals(false);
            m_tidy.setWraplen(100);
            m_tidy.setSpaces(0);

            if (m_modeWord) {
                // the cleanup patterns are only applied in cleanup mode
                m_clearStyle = new Pattern[m_cleanupPatterns.length];
                for (int i = 0; i < m_cleanupPatterns.length; i++) {
                    m_clearStyle[i] = Pattern.compile(m_cleanupPatterns[i]);
                }
            }

            m_replaceStyle = new Pattern[m_replacePatterns.length];
            for (int i = 0; i < m_replacePatterns.length; i++) {
                m_replaceStyle[i] = Pattern.compile(m_replacePatterns[i]);
            }
        }
    }

    /**
     * Runs the given HTML through the Tidy parser.<p>
     *
     * @param htmlInput the HTML to parse
     * @param encoding the encoding used to pass the HTML to Tidy and to read the result back
     *
     * @return the output written by Tidy
     *
     * @throws UnsupportedEncodingException if the given encoding is not supported
     */
    private String parse(String htmlInput, String encoding) throws UnsupportedEncodingException {

        // Tidy works on streams, so the String is passed as bytes in the given encoding
        ByteArrayInputStream in = new ByteArrayInputStream(htmlInput.getBytes(encoding));
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        m_tidy.parse(in, out);
        byte[] result = out.toByteArray();
        return new String(result, encoding);
    }

    /**
     * Trims the given HTML and applies the regular expressions to it.<p>
     *
     * In cleanup mode all matches of the cleanup patterns are removed first.
     * After that, in every mode, all matches of the replace patterns are
     * substituted by their replacement values.<p>
     *
     * @param htmlInput the HTML to process
     *
     * @return the processed HTML
     */
    private String regExp(String htmlInput) {

        String parsedHtml = htmlInput.trim();

        if (m_modeWord) {
            for (int i = 0; i < m_cleanupPatterns.length; i++) {
                parsedHtml = m_clearStyle[i].matcher(parsedHtml).replaceAll("");
            }
        }

        for (int i = 0; i < m_replacePatterns.length; i++) {
            parsedHtml = m_replaceStyle[i].matcher(parsedHtml).replaceAll(m_replaceValues[i]);
        }

        return parsedHtml;
    }
}