package com.ivata.groupware.web.format;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.util.Iterator;

import javax.xml.transform.TransformerConfigurationException;

import org.apache.log4j.Logger;
import org.dom4j.DocumentException;
import org.dom4j.io.SAXReader;
import org.dom4j.io.XMLWriter;
import org.w3c.dom.Comment;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.EntityReference;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.w3c.dom.Text;
import org.w3c.tidy.Tidy;

import com.ivata.mask.util.StringHandling;
import com.ivata.mask.web.format.CharacterEntityFormat;
import com.ivata.mask.web.format.HTMLFormat;
import com.ivata.mask.web.format.HTMLFormatter;

/**
 * <p>
 * Cleans up HTML text by running it through Tidy. Depending on the settings,
 * the result is either the tidied markup (optionally reduced to the contents
 * of the <code>body</code> element), or a plain-text rendering of the
 * document.
 * </p>
 *
 * <p>
 * The plain-text conversion keeps its position state (start of line, number
 * of consecutive new lines) in instance fields, so a single instance should
 * not be used by several threads at the same time.
 * </p>
 */
public class SanitizerFormat implements HTMLFormat {
    /**
     * Logger for this class.
     */
    private static final Logger log = Logger.getLogger(SanitizerFormat.class);

    /**
     * Horizontal rule appended to the text output after <code>HR</code> and
     * heading elements.
     */
    private static final String TEXT_RULE =
        "____________________________________________________________\n";

    /**
     * Elements which are closed with a new line followed by a horizontal
     * rule in formatted text output.
     */
    private static final String[] RULE_TAGS =
        {"HR", "H1", "H2", "H3", "H4", "H5", "H6"};

    /**
     * Elements whose contents are surrounded by <code>__</code> in formatted
     * text output.
     */
    private static final String[] EMPHASIS_TAGS =
        {"B", "BIG", "EM", "I", "STRONG", "U"};

    /**
     * Elements which start on a new line in formatted text output.
     */
    private static final String[] OPEN_NEW_LINE_TAGS =
        {"BLOCKQUOTE", "CITE", "H1", "H2", "H3", "H4", "H5", "H6", "OL", "UL",
         "TABLE", "P"};

    /**
     * Elements which are followed by a new line in formatted text output.
     */
    private static final String[] CLOSE_NEW_LINE_TAGS =
        {"TR", "TD", "TH", "P", "BR", "CITE", "LI", "BLOCKQUOTE"};

    /**
     * Elements whose child nodes are never written to the text output.
     */
    private static final String[] SKIPPED_CONTENT_TAGS =
        {"APPLET", "EMBED", "SCRIPT"};

    /**
     * Character entity format added to {@link #formatter}; configured in
     * reverse mode by the constructor.
     */
    private CharacterEntityFormat characterEntities = new CharacterEntityFormat();

    /**
     * If <code>true</code>, elements are rendered with plain-text decoration
     * (new lines, rules, emphasis markers) when converting to text. See
     * {@link #setFormattedText}.
     */
    private boolean formattedText = false;

    /**
     * Formatter applied to text content and image descriptions before they
     * are appended to the text output.
     */
    private HTMLFormatter formatter = new HTMLFormatter();

    /**
     * Value stored by {@link #setImageUri}. It is not read anywhere in this
     * class.
     */
    private String imageUri = null;

    /**
     * Value stored by {@link #setImageUriAppend}. It is not read anywhere in
     * this class.
     */
    private String imageUriAppend = null;

    /**
     * If <code>true</code>, only the child nodes of the <code>body</code>
     * element are returned when producing markup.
     */
    private boolean onlyBodyContents = false;

    /**
     * Name describing where the text came from. Only exposed through
     * {@link #getSourceName} and {@link #setSourceName} in this class.
     */
    private String sourceName = "user input";

    /**
     * <code>true</code> while the text output is positioned at the start of
     * a line, in which case leading white space is dropped.
     */
    private boolean textAtStartOfLine = true;

    /**
     * Number of consecutive new lines written to the text output. Used to
     * limit runs of empty lines.
     */
    private int textNewLineCount = 0;

    /**
     * If <code>true</code>, {@link #format} returns a plain-text rendering
     * rather than markup.
     */
    private boolean textOnly = false;

    /**
     * Tidy instance used to parse and pretty-print the HTML.
     */
    private Tidy tidy = new Tidy();

    /**
     * Configure Tidy to produce quiet, cleaned, lower-case XML output, and
     * set up the formatter to apply the character entity format in reverse
     * mode.
     */
    public SanitizerFormat() {
        tidy.setBreakBeforeBR(true);
        tidy.setIndentContent(true);
        tidy.setMakeClean(true);
        tidy.setOnlyErrors(true);
        tidy.setQuiet(true);
        tidy.setUpperCaseAttrs(false);
        tidy.setUpperCaseTags(false);
        tidy.setXmlOut(true);
        characterEntities.setReverse(true);
        formatter.add(characterEntities);
    }

    /**
     * Find out whether a tag name is one of a list of candidates. The
     * comparison ignores case, so it does not matter whether the parser
     * reports element names in upper or lower case.
     *
     * @param tagName tag name to look for.
     * @param candidates tag names to compare against.
     * @return <code>true</code> if the tag name matches any candidate.
     */
    private static boolean isOneOf(final String tagName,
            final String[] candidates) {
        for (int index = 0; index < candidates.length; ++index) {
            if (candidates[index].equalsIgnoreCase(tagName)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Append the plain-text representation of an element's closing tag.
     *
     * @param element element which is being closed.
     * @param buffer buffer the text output is appended to.
     */
    private void addCloseElementAsText(final Element element,
            final StringBuffer buffer) {
        String tagName = element.getTagName();
        if ("A".equalsIgnoreCase(tagName)) {
            // show the link target in brackets after the link text
            if (element.hasAttribute("href")) {
                notTextNewLine();
                buffer.append(" (" + element.getAttribute("href") + ")");
            }
        } else if (isOneOf(tagName, RULE_TAGS)) {
            addTextNewLine(buffer);
            buffer.append(TEXT_RULE);
        } else if (isOneOf(tagName, EMPHASIS_TAGS)) {
            notTextNewLine();
            buffer.append("__");
        } else if (isOneOf(tagName, CLOSE_NEW_LINE_TAGS)) {
            addTextNewLine(buffer);
        }
    }

    /**
     * Append the plain-text representation of an element's opening tag.
     *
     * @param element element which is being opened.
     * @param buffer buffer the text output is appended to.
     */
    private void addOpenElementAsText(final Element element,
            final StringBuffer buffer) {
        String tagName = element.getTagName();
        if (isOneOf(tagName, OPEN_NEW_LINE_TAGS)) {
            addTextNewLine(buffer);
        } else if (isOneOf(tagName, EMPHASIS_TAGS)) {
            notTextNewLine();
            buffer.append("__");
        } else if ("LI".equalsIgnoreCase(tagName)) {
            addTextNewLine(buffer);
            // the bullet counts as content, so the line is no longer empty
            notTextNewLine();
            buffer.append(" * ");
        } else if ("IMG".equalsIgnoreCase(tagName)) {
            // describe the image with its alt text, or else its title
            if (element.hasAttribute("alt")) {
                notTextNewLine();
                buffer.append(formatter.format(element.getAttribute("alt").trim()));
            } else if (element.hasAttribute("title")) {
                notTextNewLine();
                buffer.append(formatter.format(element.getAttribute("title").trim()));
            }
        }
    }

    /**
     * Append a new line to the text output, unless two consecutive new lines
     * have already been written.
     *
     * @param buffer buffer the text output is appended to.
     */
    private void addTextNewLine(final StringBuffer buffer) {
        if (textNewLineCount < 2) {
            textAtStartOfLine = true;
            buffer.append("\n");
            ++textNewLineCount;
        }
    }

    /**
     * Append the text representation of a node, and (recursively) of all its
     * children, to the buffer.
     *
     * @param node node to convert to text.
     * @param buffer buffer the text output is appended to.
     * @throws IOException declared for callers; nothing in this method
     * throws it directly.
     */
    private void addToBuffer(final Node node,
            final StringBuffer buffer) throws IOException {
        Element element = null;
        if (formattedText && (node instanceof Element)) {
            element = (Element) node;
            addOpenElementAsText(element, buffer);
        } else if (formattedText && (node instanceof Comment)) {
            // comments are left out of formatted text
        } else if (formattedText && (node instanceof EntityReference)) {
            // write the entity reference back out unchanged
            buffer.append("&");
            buffer.append(node.getNodeName());
            buffer.append(";");
        } else if (node instanceof Text) {
            String data = ((Text) node).getData();
            if (data != null) {
                String collapsed = collapseWhiteSpace(data);
                if (!collapsed.equals("")) {
                    buffer.append(formatter.format(collapsed));
                    notTextNewLine();
                }
            }
        } else {
            String value = node.getNodeValue();
            if (!StringHandling.isNullOrEmpty(value)) {
                notTextNewLine();
                buffer.append(value);
            }
        }

        // element is only set for formatted text, so the contents of
        // applets, embedded objects and scripts are only skipped in that mode
        boolean skipChildren = (element != null)
            && isOneOf(element.getTagName(), SKIPPED_CONTENT_TAGS);
        if (node.hasChildNodes() && !skipChildren) {
            NodeList children = node.getChildNodes();
            for (int index = 0; index < children.getLength(); ++index) {
                addToBuffer(children.item(index), buffer);
            }
        }
        if (element != null) {
            addCloseElementAsText(element, buffer);
        }
    }

    /**
     * Collapse every run of white space (space, tab, carriage return, line
     * feed) in the text to a single space. White space which comes before
     * the first visible character is dropped altogether if the output is
     * currently at the start of a line.
     *
     * @param data text node contents to collapse.
     * @return collapsed text; may be empty.
     */
    private String collapseWhiteSpace(final String data) {
        StringBuffer collapsed = new StringBuffer();
        boolean lastWasSpace = false;
        // tracked locally: once a visible character has been seen in this
        // node, white space must be kept even if the node started the line
        boolean atStart = textAtStartOfLine;
        int length = data.length();
        for (int index = 0; index < length; ++index) {
            char current = data.charAt(index);
            boolean whiteSpace = (current == '\n') || (current == '\r')
                || (current == ' ') || (current == '\t');
            if (whiteSpace) {
                if (!lastWasSpace) {
                    lastWasSpace = true;
                    if (!atStart) {
                        collapsed.append(' ');
                    }
                }
            } else {
                lastWasSpace = false;
                atStart = false;
                collapsed.append(current);
            }
        }
        return collapsed.toString();
    }

    /**
     * Convert a whole document to plain text.
     *
     * @param document document to convert.
     * @return text representation of the document, or a string starting
     * with <code>ERROR: </code> if the conversion failed.
     */
    private String convertToText(final Document document) {
        // every conversion starts at the beginning of a fresh line, whatever
        // state an earlier conversion left behind
        textAtStartOfLine = true;
        textNewLineCount = 0;
        StringBuffer buffer = new StringBuffer();
        try {
            addToBuffer(document, buffer);
        } catch (IOException e) {
            log.error("Error ("
                    + e.getClass().getName()
                    + ") converting the document to text.",
                    e);
            return "ERROR: " + e.getMessage();
        }
        return buffer.toString();
    }

    /**
     * Sanitize the HTML text provided. The text is parsed by Tidy and then
     * returned as plain text (if text only is set), as the contents of the
     * <code>body</code> element (if only body contents is set), or else as
     * the complete tidied document.
     *
     * @param hTMLTextParam HTML text to sanitize.
     * @return sanitized text. <code>null</code> is returned for
     * <code>null</code> input, and when only body contents were requested
     * but the tidied document has no <code>body</code> element. Input which
     * is empty or only white space is returned unchanged.
     * @throws RuntimeException if the tidied document cannot be read back
     * in or written out again.
     */
    public String format(final String hTMLTextParam) {
        if (hTMLTextParam == null) {
            if (log.isDebugEnabled()) {
                log.debug("Null input received - returning null.");
            }
            return null;
        }
        if (hTMLTextParam.trim().length() == 0) {
            if (log.isDebugEnabled()) {
                log.debug("Empty input received - returning input unchanged.");
            }
            return hTMLTextParam;
        }

        String hTMLText = escapeScriptlets(wrapInHTMLIfNeeded(hTMLTextParam));

        // NOTE(review): the platform default character set is used both here
        // and when reading the output streams back - confirm this matches
        // the encoding Tidy is configured for.
        InputStream inStream = new ByteArrayInputStream(hTMLText.getBytes());
        Document document = tidy.parseDOM(inStream, null);
        if (textOnly) {
            if (log.isDebugEnabled()) {
                log.debug("Converting document to text.");
            }
            return convertToText(document);
        }

        ByteArrayOutputStream outStream = new ByteArrayOutputStream();
        tidy.pprint(document, outStream);
        String tidied = outStream.toString();
        if (!onlyBodyContents || (tidied.trim().length() == 0)) {
            return tidied;
        }
        return extractBodyContents(tidied);
    }

    /**
     * Surround the text with <code>html</code>, <code>head</code> and
     * <code>body</code> elements, unless it already contains an
     * <code>html</code> tag (in any case).
     *
     * @param text text to check.
     * @return text which contains an <code>html</code> element.
     */
    private String wrapInHTMLIfNeeded(final String text) {
        // the text is lower-cased, so the tag searched for must be too
        boolean hasHTMLTag = text.toLowerCase().indexOf("<html") != -1;
        if (hasHTMLTag) {
            return text;
        }
        if (log.isDebugEnabled()) {
            log.debug("No HTML tag found - surrounding everything with HTML and BODY.");
        }
        StringBuffer wrapped = new StringBuffer();
        wrapped.append("<HTML><head><title></title></head><body>");
        wrapped.append(text);
        wrapped.append("</body></HTML>");
        return wrapped.toString();
    }

    /**
     * Escape server-side scriptlet delimiters, so that they are treated as
     * plain text rather than as tags.
     *
     * @param text text to escape.
     * @return text with every <code>&lt;%</code> and <code>%&gt;</code>
     * delimiter replaced by its entity-escaped form.
     */
    private String escapeScriptlets(final String text) {
        String escaped = text;
        if (escaped.indexOf("<%") != -1) {
            escaped = escaped.replaceAll("<%", "&lt;%");
        }
        if (escaped.indexOf("%>") != -1) {
            escaped = escaped.replaceAll("%>", "%&gt;");
        }
        return escaped;
    }

    /**
     * Read the tidied document back in and write out just the child nodes
     * of its <code>body</code> element.
     *
     * @param tidied complete document, as printed by Tidy.
     * @return the contents of the <code>body</code> element, or
     * <code>null</code> if the document has no <code>body</code> element.
     * @throws RuntimeException if the document cannot be read or written.
     */
    private String extractBodyContents(final String tidied) {
        String text = tidied;
        // minus signs are replaced by plain hyphens before parsing
        if (text.indexOf("−") != -1) {
            text = text.replaceAll("−", "-");
        }
        // NOTE(review): the reader is used with its default settings -
        // consider disabling DTDs/external entities if the input is
        // untrusted.
        SAXReader saxReader = new SAXReader();
        org.dom4j.Document dom4jDocument;
        try {
            dom4jDocument = saxReader.read(
                    new ByteArrayInputStream(text.getBytes()));
        } catch (DocumentException e) {
            log.error("Error ("
                    + e.getClass().getName()
                    + ") reading the document back in after Tidy:\n"
                    + tidied,
                    e);
            throw new RuntimeException(e);
        }
        org.dom4j.Element rootElement = dom4jDocument.getRootElement();
        org.dom4j.Element bodyElement = rootElement.element("body");
        if (bodyElement == null) {
            return null;
        }

        ByteArrayOutputStream outStream = new ByteArrayOutputStream();
        XMLWriter writer;
        try {
            writer = new XMLWriter(outStream,
                    new org.dom4j.io.OutputFormat("", true));
        } catch (UnsupportedEncodingException e) {
            log.error("Error ("
                    + e.getClass().getName()
                    + ") creating the document to write back out.",
                    e);
            throw new RuntimeException(e);
        }
        Iterator bodyNodeIterator = bodyElement.nodeIterator();
        while (bodyNodeIterator.hasNext()) {
            try {
                writer.write((org.dom4j.Node) bodyNodeIterator.next());
            } catch (IOException e) {
                log.error("Error ("
                        + e.getClass().getName()
                        + ") writing the body back out:\n"
                        + bodyElement.asXML(),
                        e);
                throw new RuntimeException(e);
            }
        }
        return outStream.toString();
    }

    /**
     * Get the name describing where the text came from.
     *
     * @return current source name.
     */
    public final String getSourceName() {
        return sourceName;
    }

    /**
     * Find out whether elements are rendered with plain-text decoration
     * when converting to text.
     *
     * @return current value of formatted text.
     */
    public boolean isFormattedText() {
        return formattedText;
    }

    /**
     * Find out whether {@link #format} returns plain text rather than
     * markup.
     *
     * @return current value of text only.
     */
    public boolean isTextOnly() {
        return textOnly;
    }

    /**
     * Record that content other than a new line has been written to the
     * text output.
     */
    private void notTextNewLine() {
        textNewLineCount = 0;
        textAtStartOfLine = false;
    }

    /**
     * Set whether elements are rendered with plain-text decoration when
     * converting to text.
     *
     * @param formattedText new value of formatted text.
     */
    public final void setFormattedText(final boolean formattedText) {
        this.formattedText = formattedText;
    }

    /**
     * Set the image URI. The value is stored, but not read anywhere in this
     * class.
     *
     * @param imageUri new value of image URI.
     */
    public final void setImageUri(final String imageUri) {
        this.imageUri = imageUri;
    }

    /**
     * Set the text to append to image URIs. The value is stored, but not
     * read anywhere in this class.
     *
     * @param imageUriAppend new value of image URI append.
     */
    public final void setImageUriAppend(final String imageUriAppend) {
        this.imageUriAppend = imageUriAppend;
    }

    /**
     * Set whether only the contents of the <code>body</code> element are
     * returned when producing markup.
     *
     * @param onlyChildren new value of only body contents.
     */
    public final void setOnlyBodyContents(final boolean onlyChildren) {
        this.onlyBodyContents = onlyChildren;
    }

    /**
     * Set the name describing where the text came from.
     *
     * @param sourceName new source name.
     */
    public final void setSourceName(final String sourceName) {
        this.sourceName = sourceName;
    }

    /**
     * Set whether {@link #format} returns plain text rather than markup.
     *
     * @param textOnly new value of text only.
     */
    public final void setTextOnly(final boolean textOnly) {
        this.textOnly = textOnly;
    }
}