SanitizerFormat


1   package com.ivata.groupware.web.format;
2   
3   import java.io.ByteArrayInputStream  ;
4   import java.io.ByteArrayOutputStream  ;
5   import java.io.IOException  ;
6   import java.io.InputStream  ;
7   import java.io.UnsupportedEncodingException  ;
8   import java.util.Iterator  ;
9   
10  import javax.xml.transform.TransformerConfigurationException  ;
11  
12  import org.apache.log4j.Logger;
13  import org.dom4j.DocumentException;
14  import org.dom4j.io.SAXReader;
15  import org.dom4j.io.XMLWriter;
16  import org.w3c.dom.Comment  ;
17  import org.w3c.dom.Document  ;
18  import org.w3c.dom.Element  ;
19  import org.w3c.dom.EntityReference  ;
20  import org.w3c.dom.NamedNodeMap  ;
21  import org.w3c.dom.Node  ;
22  import org.w3c.dom.NodeList  ;
23  import org.w3c.dom.Text  ;
24  import org.w3c.tidy.Tidy;
25  
26  import com.ivata.mask.util.StringHandling;
27  import com.ivata.mask.web.format.CharacterEntityFormat;
28  import com.ivata.mask.web.format.HTMLFormat;
29  import com.ivata.mask.web.format.HTMLFormatter;
30  
31  /*
32   * Copyright (c) 2001 - 2005 ivata limited.
33   * All rights reserved.
34   * -----------------------------------------------------------------------------
35   * ivata groupware may be redistributed under the GNU General Public
36   * License as published by the Free Software Foundation;
37   * version 2 of the License.
38   *
39   * These programs are free software; you can redistribute them and/or
40   * modify them under the terms of the GNU General Public License
41   * as published by the Free Software Foundation; version 2 of the License.
42   *
43   * These programs are distributed in the hope that they will be useful,
44   * but WITHOUT ANY WARRANTY; without even the implied warranty of
45   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
46   *
47   * See the GNU General Public License in the file LICENSE.txt for more
48   * details.
49   *
50   * If you would like a copy of the GNU General Public License write to
51   *
52   * Free Software Foundation, Inc.
53   * 59 Temple Place - Suite 330
54   * Boston, MA 02111-1307, USA.
55   *
56   *
57   * To arrange commercial support and licensing, contact ivata at
58   *                  http://www.ivata.com/contact.jsp
59   * -----------------------------------------------------------------------------
60   * $Log: SanitizerFormat.java,v $
61   * Revision 1.4  2005/04/30 13:07:31  colinmacleod
62   * Added EntityResolver so you don't need an
63   * internet connection.
64   *
65   * Revision 1.3  2005/04/10 20:32:00  colinmacleod
66   * Added new themes.
67   * Changed id type to String.
68   * Changed i tag to em and b tag to strong.
69   * Improved PicoContainerFactory with NanoContainer scripts.
70   *
71   * Revision 1.2  2005/04/09 17:19:42  colinmacleod
72   * Changed copyright text to GPL v2 explicitly.
73   *
74   * Revision 1.1.1.1  2005/03/10 17:49:53  colinmacleod
75   * Restructured ivata op around Hibernate/PicoContainer.
76   * Renamed ivata groupware.
77   *
78   * Revision 1.7  2004/11/03 15:54:32  colinmacleod
79   * Changed todo comments to TODO: all caps.
80   *
81   * Revision 1.6  2004/09/30 14:58:06  colinmacleod
82   * Bugfixes for documents with no surrounding tags.
83   * Added log4j.
84   *
85   * Revision 1.5  2004/08/01 11:54:07  colinmacleod
86   * Removed ivata groupware custom HTML parser in favor of JTidy.
87   *
88   * Revision 1.4  2004/07/13 19:48:08  colinmacleod
89   * Moved project to POJOs from EJBs.
90   * Applied PicoContainer to services layer (replacing session EJBs).
91   * Applied Hibernate to persistence layer (replacing entity EJBs).
92   *
93   * Revision 1.3  2004/03/21 21:16:27  colinmacleod
94   * Shortened name to ivata op.
95   *
96   * Revision 1.2  2004/02/01 22:07:30  colinmacleod
97   * Added full names to author tags
98   *
99   * Revision 1.1.1.1  2004/01/27 20:58:30  colinmacleod
100  * Moved ivata openportal to SourceForge..
101  *
102  * Revision 1.2  2003/10/15 14:15:36  colin
103  * fixing for XDoclet
104  *
105  * Revision 1.2  2003/05/06 13:42:25  peter
106  * added embedded IMG attachments functionality
107  *
108  * Revision 1.1  2003/02/24 19:33:33  colin
109  * moved to jsp
110  *
111  * Revision 1.2  2003/02/04 17:43:46  colin
112  * copyright notice
113  *
114  * Revision 1.1  2002/08/10 21:17:48  colin
115  * first version of HTML sanitizer/parser to clean up HTML code
116  */
117 
118 /**
119  * <p>
120  * This class uses the parser defined in {@linkcom.ivata.groupware.web.parser} to
121  * tidy up the HTML and posibly convert it to text-only.
122  * </p>
123  *
124  * @since 2002-08-10
125  * @author Colin MacLeod
126  * <a HREF='mailto:colin.macleod@ivata.com'>colin.macleod@ivata.com</a>
127  * @version $Revision: 1.4 $
128  * @see com.ivata.groupware.web.parser
129  */
130 public class SanitizerFormat implements HTMLFormat {
131     /**
132      * <p>
133      * <strong>Log4J</strong> logger.
134      * </p>
135      */
136     private static Logger log = Logger.getLogger(SanitizerFormat.class);
137     /**
138      * <p>Used to convert character entities back again in text mode.</p>
139      */
140     private CharacterEntityFormat characterEntities = new CharacterEntityFormat();
141     /**
142      * <p>
143      * Stores whether or not <code>format</code> should return just plain
144      * text. If <code>true</code>, only text is returned, otherwise formatted
145      * HTML is returned.
146      * </p>
147      */
148     private boolean formattedText = false;
149     /**
150      * <p>Used to convert character entities back again in text mode.</p>
151      */
152     private HTMLFormatter formatter = new HTMLFormatter();
153 
154     /**
155      * <p>
156      * Contains uri to prepend when src attribute of an image begins with
157      * <em>cid: </em>- it's and embedded attachment
158      * </p>
159      */
160     private String   imageUri = null;
161 
162     /**
163      * <p>
164      * The information to append to rewritten uris of embedded attachments
165      * </p>
166      */
167     private String   imageUriAppend = null;
168 
169     /**
170      * <p>
171      * If <code>true</code> then only the contents of the body tag are returned.
172      * </p>
173      */
174     private boolean onlyBodyContents = false;
175 
176     /**
177      * <p>
178      * Stores name of the source or file to output for debugging.
179      * </p>
180      */
181     private String   sourceName = "user input";
182 
183     /**
184      * <p>Remember whether or not we're at the start of a line in a text file.</p>
185      */
186     private boolean textAtStartOfLine = true;
187 
188     /**
189      * <p>Remember how many newlines we've made in a text file.</p>
190      */
191     private int textNewLineCount = 0;
192 
193     /**
194      * <p>
195      * Stores whether or not <code>format</code> should return just plain
196      * text with line feeds and converted horizonal rule. If <code>true</code>,
197      * fomratted text is returned, otherwise formatted
198      * HTML is returned.
199      * </p>
200      */
201     private boolean textOnly = false;
202 
203     /**
204      * <p>
205      * This tidy instance does all the hard work.
206      * </p>
207      */
208     private Tidy tidy = new Tidy();
209 
210     /**
211      * <p>
212      * Default constructor.
213      * </p>
214      */
215     public SanitizerFormat() {
216         tidy.setBreakBeforeBR(true);
217         tidy.setIndentContent(true);
218         tidy.setMakeClean(true);
219         tidy.setOnlyErrors(true);
220         tidy.setQuiet(true);
221         tidy.setUpperCaseAttrs(false);
222         tidy.setUpperCaseTags(false);
223         tidy.setXmlOut(true);
224         // these objects are used to convert character entities back again
225         characterEntities.setReverse(true);
226         formatter.add(characterEntities);
227     }
228 
229     /**
230      * <p>Convert an closing tag element to text.</p>
231      *
232      * @param element element which is closed.
233      * @param buffer <code>PrintWriter</code> to send the results to.
234      */
235     private void addCloseElementAsText(final Element   element,
236             final StringBuffer   buffer) {
237         // follow table cells with a tab
238         if(element.getTagName().equals("A")) {
239             // see what the link was
240             if(element.hasAttribute("href")) {
241                 notTextNewLine();
242                 buffer.append(" (" + element.getAttribute("href") + ")");
243             }
244         } else if(element.getTagName().equals("HR") ||
245                   element.getTagName().equals("H1") ||
246                   element.getTagName().equals("H2") ||
247                   element.getTagName().equals("H3") ||
248                   element.getTagName().equals("H4") ||
249                   element.getTagName().equals("H5") ||
250                   element.getTagName().equals("H6")) {
251             addTextNewLine(buffer);
252             buffer.append("____________________________________________________________\n");
253         } else if(element.getTagName().equals("B") ||
254                   element.getTagName().equals("BIG") ||
255                   element.getTagName().equals("EM") ||
256                   element.getTagName().equals("I") ||
257                   element.getTagName().equals("STRONG") ||
258                   element.getTagName().equals("U")) {
259             notTextNewLine();
260             buffer.append("__");
261         }else if(element.getTagName().equals("TR") ||
262                  element.getTagName().equals("TD") ||
263                  element.getTagName().equals("TH") ||
264                  element.getTagName().equals("P") ||
265                  element.getTagName().equals("BR") ||
266                  element.getTagName().equals("CITE") ||
267                  element.getTagName().equals("LI") ||
268                  element.getTagName().equals("BLOCKQUOTE")) {
269             addTextNewLine(buffer);
270         }
271     }
272 
273 
274     /**
275      * <p>Convert an open tag element to text.</p>
276      *
277      * @param element element which is opened.
278      * @param buffer <code>PrintWriter</code> to send the results to.
279      */
280     private void addOpenElementAsText(final Element   element,
281             final StringBuffer   buffer) {
282         // precede some tags with a character in read-only mode
283         if(element.getTagName().equals("BLOCKQUOTE") ||
284            element.getTagName().equals("CITE") ||
285            element.getTagName().equals("H1") ||
286            element.getTagName().equals("H2") ||
287            element.getTagName().equals("H3") ||
288            element.getTagName().equals("H4") ||
289            element.getTagName().equals("H5") ||
290            element.getTagName().equals("H6") ||
291            element.getTagName().equals("OL") ||
292            element.getTagName().equals("UL") ||
293            element.getTagName().equals("TABLE") ||
294            element.getTagName().equals("P") ||
295            element.getTagName().equals("CITE") ||
296            element.getTagName().equals("BLOCKQUOTE")) {
297             addTextNewLine(buffer);
298         } else if(element.getTagName().equals("B") ||
299                   element.getTagName().equals("BIG") ||
300                   element.getTagName().equals("EM") ||
301                   element.getTagName().equals("I") ||
302                   element.getTagName().equals("STRONG") ||
303                   element.getTagName().equals("U")) {
304             notTextNewLine();
305             buffer.append("__");
306         } else if(element.getTagName().equals("LI")) {
307             // TODO: work buffer somehow if it is ol or ul
308             addTextNewLine(buffer);
309             notTextNewLine();
310             buffer.append(" * ");
311         } else if(element.getTagName().equals("IMG")) {
312             // see if there is an alternate text for this image
313             if(element.hasAttribute("alt")) {
314                 notTextNewLine();
315                 buffer.append(formatter.format(element.getAttribute("alt").trim()));
316             } else if(element.hasAttribute("title")) {
317                 notTextNewLine();
318                 buffer.append(formatter.format(element.getAttribute("title").trim()));
319             }
320         }
321     }
322 
323 
324     /**
325      * <p>Write a text new line.</p>
326      */
327     private void addTextNewLine(final StringBuffer   buffer) {
328         if(textNewLineCount < 2) {
329             textAtStartOfLine = true;
330             buffer.append("\n");
331             ++textNewLineCount;
332         }
333     }
334 
335     /**
336      * <p>Add a string representation of the given element to the buffer.</p>
337      *
338      * @param node node to add, and to add all of the children for.
339      * @param onlyChildren if <code>true</code> then only the children of this
340      * node are added, otherwise the node itself is added too.
341      * @param buffer <code>PrintWriter</code> to send the results to.
342      * @param textOnly if <code>true</code>, only text is returned, otherwise
343      * formatted HTML is returned.
344      */
345     private void addToBuffer(final Node   node,
346             final StringBuffer   buffer) throws IOException   {
347         Element   element = null;
348         if(formattedText && Element  .class.isInstance(node)) {
349             element = (Element  ) node;
350             NamedNodeMap   attributes = element.getAttributes();
351             addOpenElementAsText(element, buffer);
352         } else if(formattedText && Comment  .class.isInstance(node)) {
353             // ignore comments in text mode
354         } else if(formattedText && EntityReference  .class.isInstance(node)) {
355             EntityReference   entity = (EntityReference  ) node;
356             buffer.append("&");
357             buffer.append(entity.getNodeName());
358             buffer.append(";");
359         } else if(Text  .class.isInstance(node)) {
360             Text   text = (Text  ) node;
361             String   data = text.getData();
362             StringBuffer   dataReformatted = new StringBuffer  ();
363             if(data != null) {
364                 // strip buffer any funny characters and double spaces
365                 int length = data.length();
366                 boolean lastWasSpace = false;
367                 boolean atStart = textAtStartOfLine;
368                 for(int index = 0; index < length; ++index) {
369                     // newlines, carriage returns and tabs are all spaces now
370                     if((data.charAt(index) == '\n') ||
371                        (data.charAt(index) == '\r') ||
372                        (data.charAt(index) == ' ') ||
373                        (data.charAt(index) == '\t')) {
374                         // ignore double spaces
375                         if(!lastWasSpace) {
376                             lastWasSpace = true;
377                             if(!textAtStartOfLine) {
378                                 dataReformatted.append(' ');
379                             }
380                         }
381                     } else {
382                         lastWasSpace = false;
383                         atStart = false;
384                         dataReformatted.append(data.charAt(index));
385                     }
386                 }
387                 if(!(data = dataReformatted.toString()).equals("")) {
388                     buffer.append(formatter.format(data));
389                     notTextNewLine();
390                 }
391             }
392         } else {
393             String   value = node.getNodeValue();
394             if(!StringHandling.isNullOrEmpty(value)) {
395                 notTextNewLine();
396                 buffer.append(value);
397             }
398         }
399 
400         // if that doesn't work, try the children
401         if(node.hasChildNodes() &&
402            ((element == null) ||
403             // these are the tags to ignore the contents of in text mode
404             (!element.getTagName().equals("APPLET") &&
405              !element.getTagName().equals("EMBED") &&
406              !element.getTagName().equals("SCRIPT")))) {
407             NodeList   children = node.getChildNodes();
408             for(int index = 0; index < children.getLength(); ++index) {
409                 Node   nextChild = children.item(index);
410                 addToBuffer(nextChild, buffer);
411             }
412         }
413         // in text only mode, certain elements are followed by a special character
414         if(element != null) {
415             addCloseElementAsText(element, buffer);
416         }
417 
418     }
419 
420     /**
421      * <p>
422      * Internal method which converts the <strong>HTML</strong> into plain text.
423      * </p>
424      *
425      * @param element Root <strong>HTML</strong> element to be converted.
426      * @return Plain text matching the <strong>HTML</strong>.
427      */
428     private String   convertToText(final Document   document) {
429         StringBuffer   buffer = new StringBuffer  ();
430         try {
431             addToBuffer(document, buffer);
432         } catch (IOException   e) {
433             e.printStackTrace();
434             return "ERROR: " + e.getMessage();
435         }
436         return buffer.toString();
437     }
438 
439     /**
440      * <p>
441      * Format the string given in <code>hTMLText</code> and clean up the
442      * syntax of the HTML.
443      * </p>
444      *
445      * @param hTMLTextParam
446      *            the text to truncate.
447      * @throws TransformerConfigurationException
448      */
449     public String   format(final String   hTMLTextParam) {
450         if (hTMLTextParam == null) {
451             if (log.isDebugEnabled()) {
452                 log.debug("Null input received - returning null.");
453             }
454             return null;
455         }
456         if (hTMLTextParam.trim().length() == 0) {
457             if (log.isDebugEnabled()) {
458                 log.debug("Empty input received - returning input unchanged.");
459             }
460             return hTMLTextParam;
461         }
462         // basic sanity check - if there is no HTML tag, assume we only have
463         // body content.
464         String   lowerCaseText = hTMLTextParam.toLowerCase();
465         boolean hasHTMLTag = lowerCaseText.indexOf("<HTML") != -1;
466         String   hTMLText;
467         if (!hasHTMLTag) {
468             if (log.isDebugEnabled()) {
469                 log.debug("No HTML tag found - surrounding everything with HTML and BODY.");
470             }
471             StringBuffer   newHTMLText = new StringBuffer  ();
472             newHTMLText.append("<HTML><head><title></title></head><body>");
473             newHTMLText.append(hTMLTextParam);
474             newHTMLText.append("</body></HTML>");
475             hTMLText = newHTMLText.toString();
476         } else {
477             hTMLText = hTMLTextParam;
478         }
479 
480 
481         // TOTAL HACK to convert JSP tags to entities
482         if (hTMLText.indexOf("<%") != -1) {
483             hTMLText = hTMLText.replaceAll("<%", "&lt;%");
484         }
485         if (hTMLText.indexOf("%>") != -1) {
486             hTMLText = hTMLText.replaceAll("%>", "%&gt;");
487         }
488 
489         InputStream   inStream = new ByteArrayInputStream  (hTMLText.getBytes());
490         Document   document = tidy.parseDOM(inStream, null);
491         if (textOnly) {
492             if (log.isDebugEnabled()) {
493                 log.debug("Converting document to text.");
494             }
495             return convertToText(document);
496         } else {
497             ByteArrayOutputStream   outStream = new ByteArrayOutputStream  ();
498             tidy.pprint(document, outStream);
499 
500             if (onlyBodyContents
501                     && (outStream.toString().trim().length() > 0)) {
502                 SAXReader saxReader = new SAXReader();
503                 String   text = outStream.toString();
504                 // EVEN BIGGER HACK to remove previous over-zealous dash replacement
505                 if (text.indexOf("&minus;") != -1) {
506                     text = text.replaceAll("&minus;", "-");
507                 }
508                 inStream = new ByteArrayInputStream  (text.getBytes());
509                 org.dom4j.Document dom4jDocument;
510                 try {
511                     dom4jDocument = saxReader.read(inStream);
512                 } catch (DocumentException e) {
513                     log.error("Error ("
514                             + e.getClass().getName()
515                             + ") reading the document back in after Tidy:\n"
516                             + outStream.toString(),
517                             e);
518                     throw new RuntimeException  (e);
519                 }
520                 org.dom4j.Element rootElement = dom4jDocument.getRootElement();
521                 org.dom4j.Element bodyElement = rootElement.element("body");
522                 if (bodyElement == null) {
523                     return null;
524                 }
525                 outStream = new ByteArrayOutputStream  ();
526                 XMLWriter writer;
527                 try {
528                     writer = new XMLWriter(outStream,
529                             new org.dom4j.io.OutputFormat("", true));
530                 } catch (UnsupportedEncodingException   e) {
531                     log.error("Error ("
532                             + e.getClass().getName()
533                             + ") creating the document to write back out.",
534                             e);
535                     throw new RuntimeException  (e);
536                 }
537                 Iterator   bodyNodeIterator = bodyElement.nodeIterator();
538                 while(bodyNodeIterator.hasNext()) {
539                     try {
540                         writer.write((org.dom4j.Node)bodyNodeIterator.next());
541                     } catch (IOException   e) {
542                         log.error("Error ("
543                                 + e.getClass().getName()
544                                 + ") writing the body back out:\n"
545                                 + bodyElement.asXML(),
546                                 e);
547                         throw new RuntimeException  (e);
548                     }
549                 }
550             }
551             return outStream.toString();
552         }
553     }
554 
555 
556     /**
557      * <p>
558      * Get the name of the source or file, used for debugging.
559      * </p>
560      *
561      * @return the current value of the source name, output by the parser for
562      *         debugging.
563      */
564     public final String   getSourceName() {
565         return sourceName;
566     }
567 
568     /**
569      * <p>
570      * Stores whether or not <code>format</code> should return just plain
571      * text with line feeds and converted horizonal rule. If <code>true</code>,
572      * fomratted text is returned, otherwise formatted
573      * HTML is returned.
574      * </p>
575      *
576      * @return Returns formattedText.
577      */
578     public boolean isFormattedText() {
579         return formattedText;
580     }
581 
582     /**
583      * <p>
584      * Get whether or not the parser will only return plain text.
585      * </p>
586      *
587      * @return <code>true</code> if the parser will only return plain text,
588      *         otherwise <code>false</code>.
589      */
590     public boolean isTextOnly() {
591         return textOnly;
592     }
593 
594     /**
595      * <p>Write something other than a new line.</p>
596      */
597     private void notTextNewLine() {
598         textNewLineCount = 0;
599         textAtStartOfLine = false;
600     }
601     /**
602      * <p>
603      * Stores whether or not <code>format</code> should return just plain
604      * text with line feeds and converted horizonal rule. If <code>true</code>,
605      * fomratted text is returned, otherwise formatted
606      * HTML is returned.
607      * </p>
608      *
609      * @param formattedText The new value of formattedText to set.
610      */
611     public final void setFormattedText(final boolean formattedText) {
612         this.formattedText = formattedText;
613     }
614 
615     /**
616      * <p>
617      * Contains uri to prepend when src attribute of an image begins with
618      * <em>cid: </em>- it's and embedded attachment
619      * </p>
620      *
621      * @param imageUri -
622      *            the uri to prepend when src attribute of an image begins with
623      *            <em>cid: </em>
624      */
625     public final void setImageUri(final String   imageUri) {
626         this.imageUri = imageUri;
627     }
628 
629     /**
630      * <p>
631      * The information to append to rewritten uris of embedded attachments
632      * </p>
633      *
634      * @param imageUriAppend
635      *            the information to append to rewritten uris of embedded
636      *            attachments
637      */
638     public final void setImageUriAppend(final String   imageUriAppend) {
639         this.imageUriAppend = imageUriAppend;
640     }
641 
642     /**
643      * <p>
644      * If <code>true</code> then only the contents of the body tag are returned.
645      * </p>
646      *
647      * @param onlyBodyContents
648      *            set to <code>true</code to specify that the
649      * parser should only include the contents of the body tag.
650      */
651     public final void setOnlyBodyContents(final boolean onlyChildren) {
652         this.onlyBodyContents = onlyChildren;
653     }
654 
655     /**
656      * <p>
657      * Set the name of the source or file, used for debugging.
658      * </p>
659      *
660      * @param sourceName
661      *            the current value of the source name, output by the parser for
662      *            debugging.
663      */
664     public final void setSourceName(final String   sourceName) {
665         this.sourceName = sourceName;
666     }
667 
668     /**
669      * <p>
670      * Set whether or not the parser should only return plain text.
671      * </p>
672      *
673      * @param textOnly
674      *            set to <code>true</code> if the parser should only return
675      *            plain text, otherwise <code>false</code>.
676      */
677     public final void setTextOnly(final boolean textOnly) {
678         this.textOnly = textOnly;
679     }
680 }
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags