KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > com > ivata > groupware > web > format > SanitizerFormat


1 package com.ivata.groupware.web.format;
2
3 import java.io.ByteArrayInputStream JavaDoc;
4 import java.io.ByteArrayOutputStream JavaDoc;
5 import java.io.IOException JavaDoc;
6 import java.io.InputStream JavaDoc;
7 import java.io.UnsupportedEncodingException JavaDoc;
8 import java.util.Iterator JavaDoc;
9
10 import javax.xml.transform.TransformerConfigurationException JavaDoc;
11
12 import org.apache.log4j.Logger;
13 import org.dom4j.DocumentException;
14 import org.dom4j.io.SAXReader;
15 import org.dom4j.io.XMLWriter;
16 import org.w3c.dom.Comment JavaDoc;
17 import org.w3c.dom.Document JavaDoc;
18 import org.w3c.dom.Element JavaDoc;
19 import org.w3c.dom.EntityReference JavaDoc;
20 import org.w3c.dom.NamedNodeMap JavaDoc;
21 import org.w3c.dom.Node JavaDoc;
22 import org.w3c.dom.NodeList JavaDoc;
23 import org.w3c.dom.Text JavaDoc;
24 import org.w3c.tidy.Tidy;
25
26 import com.ivata.mask.util.StringHandling;
27 import com.ivata.mask.web.format.CharacterEntityFormat;
28 import com.ivata.mask.web.format.HTMLFormat;
29 import com.ivata.mask.web.format.HTMLFormatter;
30
31 /*
32  * Copyright (c) 2001 - 2005 ivata limited.
33  * All rights reserved.
34  * -----------------------------------------------------------------------------
35  * ivata groupware may be redistributed under the GNU General Public
36  * License as published by the Free Software Foundation;
37  * version 2 of the License.
38  *
39  * These programs are free software; you can redistribute them and/or
40  * modify them under the terms of the GNU General Public License
41  * as published by the Free Software Foundation; version 2 of the License.
42  *
43  * These programs are distributed in the hope that they will be useful,
44  * but WITHOUT ANY WARRANTY; without even the implied warranty of
45  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
46  *
47  * See the GNU General Public License in the file LICENSE.txt for more
48  * details.
49  *
50  * If you would like a copy of the GNU General Public License write to
51  *
52  * Free Software Foundation, Inc.
53  * 59 Temple Place - Suite 330
54  * Boston, MA 02111-1307, USA.
55  *
56  *
57  * To arrange commercial support and licensing, contact ivata at
58  * http://www.ivata.com/contact.jsp
59  * -----------------------------------------------------------------------------
60  * $Log: SanitizerFormat.java,v $
61  * Revision 1.4 2005/04/30 13:07:31 colinmacleod
62  * Added EntityResolver so you don't need an
63  * internet connection.
64  *
65  * Revision 1.3 2005/04/10 20:32:00 colinmacleod
66  * Added new themes.
67  * Changed id type to String.
68  * Changed i tag to em and b tag to strong.
69  * Improved PicoContainerFactory with NanoContainer scripts.
70  *
71  * Revision 1.2 2005/04/09 17:19:42 colinmacleod
72  * Changed copyright text to GPL v2 explicitly.
73  *
74  * Revision 1.1.1.1 2005/03/10 17:49:53 colinmacleod
75  * Restructured ivata op around Hibernate/PicoContainer.
76  * Renamed ivata groupware.
77  *
78  * Revision 1.7 2004/11/03 15:54:32 colinmacleod
79  * Changed todo comments to TODO: all caps.
80  *
81  * Revision 1.6 2004/09/30 14:58:06 colinmacleod
82  * Bugfixes for documents with no surrounding tags.
83  * Added log4j.
84  *
85  * Revision 1.5 2004/08/01 11:54:07 colinmacleod
86  * Removed ivata groupware custom HTML parser in favor of JTidy.
87  *
88  * Revision 1.4 2004/07/13 19:48:08 colinmacleod
89  * Moved project to POJOs from EJBs.
90  * Applied PicoContainer to services layer (replacing session EJBs).
91  * Applied Hibernate to persistence layer (replacing entity EJBs).
92  *
93  * Revision 1.3 2004/03/21 21:16:27 colinmacleod
94  * Shortened name to ivata op.
95  *
96  * Revision 1.2 2004/02/01 22:07:30 colinmacleod
97  * Added full names to author tags
98  *
99  * Revision 1.1.1.1 2004/01/27 20:58:30 colinmacleod
100  * Moved ivata openportal to SourceForge..
101  *
102  * Revision 1.2 2003/10/15 14:15:36 colin
103  * fixing for XDoclet
104  *
105  * Revision 1.2 2003/05/06 13:42:25 peter
106  * added embedded IMG attachments functionality
107  *
108  * Revision 1.1 2003/02/24 19:33:33 colin
109  * moved to jsp
110  *
111  * Revision 1.2 2003/02/04 17:43:46 colin
112  * copyright notice
113  *
114  * Revision 1.1 2002/08/10 21:17:48 colin
115  * first version of HTML sanitizer/parser to clean up HTML code
116  */

117
118 /**
119  * <p>
120  * This class uses the parser defined in {@link com.ivata.groupware.web.parser} to
121  * tidy up the HTML and possibly convert it to text-only.
122  * </p>
123  *
124  * @since 2002-08-10
125  * @author Colin MacLeod
126  * <a HREF='mailto:colin.macleod@ivata.com'>colin.macleod@ivata.com</a>
127  * @version $Revision: 1.4 $
128  * @see com.ivata.groupware.web.parser
129  */

130 public class SanitizerFormat implements HTMLFormat {
131     /**
132      * <p>
133      * <strong>Log4J</strong> logger.
134      * </p>
135      */

136     private static Logger log = Logger.getLogger(SanitizerFormat.class);
137     /**
138      * <p>Used to convert character entities back again in text mode.</p>
139      */

140     private CharacterEntityFormat characterEntities = new CharacterEntityFormat();
141     /**
142      * <p>
143      * Stores whether or not the plain text produced by <code>format</code>
144      * is laid out with line feeds, rules and emphasis markers in place of
145      * the HTML elements. Only has an effect when text-only output is selected.
146      * </p>
147      */

148     private boolean formattedText = false;
149     /**
150      * <p>Formatter which applies the character entity conversion to text and attribute values in text mode.</p>
151      */

152     private HTMLFormatter formatter = new HTMLFormatter();
153
154     /**
155      * <p>
156      * Contains the uri to prepend when the src attribute of an image begins with
157      * <em>cid: </em>- i.e. it is an embedded attachment
158      * </p>
159      */

160     private String JavaDoc imageUri = null;
161
162     /**
163      * <p>
164      * The information to append to rewritten uris of embedded attachments
165      * </p>
166      */

167     private String JavaDoc imageUriAppend = null;
168
169     /**
170      * <p>
171      * If <code>true</code> then only the contents of the body tag are returned.
172      * </p>
173      */

174     private boolean onlyBodyContents = false;
175
176     /**
177      * <p>
178      * Stores name of the source or file to output for debugging.
179      * </p>
180      */

181     private String JavaDoc sourceName = "user input";
182
183     /**
184      * <p>Remember whether or not we're at the start of a line in a text file.</p>
185      */

186     private boolean textAtStartOfLine = true;
187
188     /**
189      * <p>Remember how many newlines we've made in a text file.</p>
190      */

191     private int textNewLineCount = 0;
192
193     /**
194      * <p>
195      * Stores whether or not <code>format</code> should return just plain
196      * text. If <code>true</code>, only text is returned, otherwise formatted
197      * HTML is returned.
199      * </p>
200      */

201     private boolean textOnly = false;
202
203     /**
204      * <p>
205      * This tidy instance does all the hard work.
206      * </p>
207      */

208     private Tidy tidy = new Tidy();
209
210     /**
211      * <p>
212      * Default constructor.
213      * </p>
214      */

215     public SanitizerFormat() {
216         tidy.setBreakBeforeBR(true);
217         tidy.setIndentContent(true);
218         tidy.setMakeClean(true);
219         tidy.setOnlyErrors(true);
220         tidy.setQuiet(true);
221         tidy.setUpperCaseAttrs(false);
222         tidy.setUpperCaseTags(false);
223         tidy.setXmlOut(true);
224         // these objects are used to convert character entities back again
225
characterEntities.setReverse(true);
226         formatter.add(characterEntities);
227     }
228
229     /**
230      * <p>Convert an closing tag element to text.</p>
231      *
232      * @param element element which is closed.
233      * @param buffer <code>PrintWriter</code> to send the results to.
234      */

235     private void addCloseElementAsText(final Element JavaDoc element,
236             final StringBuffer JavaDoc buffer) {
237         // follow table cells with a tab
238
if(element.getTagName().equals("A")) {
239             // see what the link was
240
if(element.hasAttribute("href")) {
241                 notTextNewLine();
242                 buffer.append(" (" + element.getAttribute("href") + ")");
243             }
244         } else if(element.getTagName().equals("HR") ||
245                   element.getTagName().equals("H1") ||
246                   element.getTagName().equals("H2") ||
247                   element.getTagName().equals("H3") ||
248                   element.getTagName().equals("H4") ||
249                   element.getTagName().equals("H5") ||
250                   element.getTagName().equals("H6")) {
251             addTextNewLine(buffer);
252             buffer.append("____________________________________________________________\n");
253         } else if(element.getTagName().equals("B") ||
254                   element.getTagName().equals("BIG") ||
255                   element.getTagName().equals("EM") ||
256                   element.getTagName().equals("I") ||
257                   element.getTagName().equals("STRONG") ||
258                   element.getTagName().equals("U")) {
259             notTextNewLine();
260             buffer.append("__");
261         }else if(element.getTagName().equals("TR") ||
262                  element.getTagName().equals("TD") ||
263                  element.getTagName().equals("TH") ||
264                  element.getTagName().equals("P") ||
265                  element.getTagName().equals("BR") ||
266                  element.getTagName().equals("CITE") ||
267                  element.getTagName().equals("LI") ||
268                  element.getTagName().equals("BLOCKQUOTE")) {
269             addTextNewLine(buffer);
270         }
271     }
272
273
274     /**
275      * <p>Convert an open tag element to text.</p>
276      *
277      * @param element element which is opened.
278      * @param buffer <code>PrintWriter</code> to send the results to.
279      */

280     private void addOpenElementAsText(final Element JavaDoc element,
281             final StringBuffer JavaDoc buffer) {
282         // precede some tags with a character in read-only mode
283
if(element.getTagName().equals("BLOCKQUOTE") ||
284            element.getTagName().equals("CITE") ||
285            element.getTagName().equals("H1") ||
286            element.getTagName().equals("H2") ||
287            element.getTagName().equals("H3") ||
288            element.getTagName().equals("H4") ||
289            element.getTagName().equals("H5") ||
290            element.getTagName().equals("H6") ||
291            element.getTagName().equals("OL") ||
292            element.getTagName().equals("UL") ||
293            element.getTagName().equals("TABLE") ||
294            element.getTagName().equals("P") ||
295            element.getTagName().equals("CITE") ||
296            element.getTagName().equals("BLOCKQUOTE")) {
297             addTextNewLine(buffer);
298         } else if(element.getTagName().equals("B") ||
299                   element.getTagName().equals("BIG") ||
300                   element.getTagName().equals("EM") ||
301                   element.getTagName().equals("I") ||
302                   element.getTagName().equals("STRONG") ||
303                   element.getTagName().equals("U")) {
304             notTextNewLine();
305             buffer.append("__");
306         } else if(element.getTagName().equals("LI")) {
307             // TODO: work buffer somehow if it is ol or ul
308
addTextNewLine(buffer);
309             notTextNewLine();
310             buffer.append(" * ");
311         } else if(element.getTagName().equals("IMG")) {
312             // see if there is an alternate text for this image
313
if(element.hasAttribute("alt")) {
314                 notTextNewLine();
315                 buffer.append(formatter.format(element.getAttribute("alt").trim()));
316             } else if(element.hasAttribute("title")) {
317                 notTextNewLine();
318                 buffer.append(formatter.format(element.getAttribute("title").trim()));
319             }
320         }
321     }
322
323
324     /**
325      * <p>Write a text new line.</p>
326      */

327     private void addTextNewLine(final StringBuffer JavaDoc buffer) {
328         if(textNewLineCount < 2) {
329             textAtStartOfLine = true;
330             buffer.append("\n");
331             ++textNewLineCount;
332         }
333     }
334
335     /**
336      * <p>Add a string representation of the given element to the buffer.</p>
337      *
338      * @param node node to add, and to add all of the children for.
339      * @param onlyChildren if <code>true</code> then only the children of this
340      * node are added, otherwise the node itself is added too.
341      * @param buffer <code>PrintWriter</code> to send the results to.
342      * @param textOnly if <code>true</code>, only text is returned, otherwise
343      * formatted HTML is returned.
344      */

345     private void addToBuffer(final Node JavaDoc node,
346             final StringBuffer JavaDoc buffer) throws IOException JavaDoc {
347         Element JavaDoc element = null;
348         if(formattedText && Element JavaDoc.class.isInstance(node)) {
349             element = (Element JavaDoc) node;
350             NamedNodeMap JavaDoc attributes = element.getAttributes();
351             addOpenElementAsText(element, buffer);
352         } else if(formattedText && Comment JavaDoc.class.isInstance(node)) {
353             // ignore comments in text mode
354
} else if(formattedText && EntityReference JavaDoc.class.isInstance(node)) {
355             EntityReference JavaDoc entity = (EntityReference JavaDoc) node;
356             buffer.append("&");
357             buffer.append(entity.getNodeName());
358             buffer.append(";");
359         } else if(Text JavaDoc.class.isInstance(node)) {
360             Text JavaDoc text = (Text JavaDoc) node;
361             String JavaDoc data = text.getData();
362             StringBuffer JavaDoc dataReformatted = new StringBuffer JavaDoc();
363             if(data != null) {
364                 // strip buffer any funny characters and double spaces
365
int length = data.length();
366                 boolean lastWasSpace = false;
367                 boolean atStart = textAtStartOfLine;
368                 for(int index = 0; index < length; ++index) {
369                     // newlines, carriage returns and tabs are all spaces now
370
if((data.charAt(index) == '\n') ||
371                        (data.charAt(index) == '\r') ||
372                        (data.charAt(index) == ' ') ||
373                        (data.charAt(index) == '\t')) {
374                         // ignore double spaces
375
if(!lastWasSpace) {
376                             lastWasSpace = true;
377                             if(!textAtStartOfLine) {
378                                 dataReformatted.append(' ');
379                             }
380                         }
381                     } else {
382                         lastWasSpace = false;
383                         atStart = false;
384                         dataReformatted.append(data.charAt(index));
385                     }
386                 }
387                 if(!(data = dataReformatted.toString()).equals("")) {
388                     buffer.append(formatter.format(data));
389                     notTextNewLine();
390                 }
391             }
392         } else {
393             String JavaDoc value = node.getNodeValue();
394             if(!StringHandling.isNullOrEmpty(value)) {
395                 notTextNewLine();
396                 buffer.append(value);
397             }
398         }
399
400         // if that doesn't work, try the children
401
if(node.hasChildNodes() &&
402            ((element == null) ||
403             // these are the tags to ignore the contents of in text mode
404
(!element.getTagName().equals("APPLET") &&
405              !element.getTagName().equals("EMBED") &&
406              !element.getTagName().equals("SCRIPT")))) {
407             NodeList JavaDoc children = node.getChildNodes();
408             for(int index = 0; index < children.getLength(); ++index) {
409                 Node JavaDoc nextChild = children.item(index);
410                 addToBuffer(nextChild, buffer);
411             }
412         }
413         // in text only mode, certain elements are followed by a special character
414
if(element != null) {
415             addCloseElementAsText(element, buffer);
416         }
417
418     }
419
420     /**
421      * <p>
422      * Internal method which converts the <strong>HTML</strong> into plain text.
423      * </p>
424      *
425      * @param element Root <strong>HTML</strong> element to be converted.
426      * @return Plain text matching the <strong>HTML</strong>.
427      */

428     private String JavaDoc convertToText(final Document JavaDoc document) {
429         StringBuffer JavaDoc buffer = new StringBuffer JavaDoc();
430         try {
431             addToBuffer(document, buffer);
432         } catch (IOException JavaDoc e) {
433             e.printStackTrace();
434             return "ERROR: " + e.getMessage();
435         }
436         return buffer.toString();
437     }
438
439     /**
440      * <p>
441      * Format the string given in <code>hTMLText</code> and clean up the
442      * syntax of the HTML.
443      * </p>
444      *
445      * @param hTMLTextParam
446      * the text to truncate.
447      * @throws TransformerConfigurationException
448      */

449     public String JavaDoc format(final String JavaDoc hTMLTextParam) {
450         if (hTMLTextParam == null) {
451             if (log.isDebugEnabled()) {
452                 log.debug("Null input received - returning null.");
453             }
454             return null;
455         }
456         if (hTMLTextParam.trim().length() == 0) {
457             if (log.isDebugEnabled()) {
458                 log.debug("Empty input received - returning input unchanged.");
459             }
460             return hTMLTextParam;
461         }
462         // basic sanity check - if there is no HTML tag, assume we only have
463
// body content.
464
String JavaDoc lowerCaseText = hTMLTextParam.toLowerCase();
465         boolean hasHTMLTag = lowerCaseText.indexOf("<HTML") != -1;
466         String JavaDoc hTMLText;
467         if (!hasHTMLTag) {
468             if (log.isDebugEnabled()) {
469                 log.debug("No HTML tag found - surrounding everything with HTML and BODY.");
470             }
471             StringBuffer JavaDoc newHTMLText = new StringBuffer JavaDoc();
472             newHTMLText.append("<HTML><head><title></title></head><body>");
473             newHTMLText.append(hTMLTextParam);
474             newHTMLText.append("</body></HTML>");
475             hTMLText = newHTMLText.toString();
476         } else {
477             hTMLText = hTMLTextParam;
478         }
479
480
481         // TOTAL HACK to convert JSP tags to entities
482
if (hTMLText.indexOf("<%") != -1) {
483             hTMLText = hTMLText.replaceAll("<%", "&lt;%");
484         }
485         if (hTMLText.indexOf("%>") != -1) {
486             hTMLText = hTMLText.replaceAll("%>", "%&gt;");
487         }
488
489         InputStream JavaDoc inStream = new ByteArrayInputStream JavaDoc(hTMLText.getBytes());
490         Document JavaDoc document = tidy.parseDOM(inStream, null);
491         if (textOnly) {
492             if (log.isDebugEnabled()) {
493                 log.debug("Converting document to text.");
494             }
495             return convertToText(document);
496         } else {
497             ByteArrayOutputStream JavaDoc outStream = new ByteArrayOutputStream JavaDoc();
498             tidy.pprint(document, outStream);
499
500             if (onlyBodyContents
501                     && (outStream.toString().trim().length() > 0)) {
502                 SAXReader saxReader = new SAXReader();
503                 String JavaDoc text = outStream.toString();
504                 // EVEN BIGGER HACK to remove previous over-zealous dash replacement
505
if (text.indexOf("&minus;") != -1) {
506                     text = text.replaceAll("&minus;", "-");
507                 }
508                 inStream = new ByteArrayInputStream JavaDoc(text.getBytes());
509                 org.dom4j.Document dom4jDocument;
510                 try {
511                     dom4jDocument = saxReader.read(inStream);
512                 } catch (DocumentException e) {
513                     log.error("Error ("
514                             + e.getClass().getName()
515                             + ") reading the document back in after Tidy:\n"
516                             + outStream.toString(),
517                             e);
518                     throw new RuntimeException JavaDoc(e);
519                 }
520                 org.dom4j.Element rootElement = dom4jDocument.getRootElement();
521                 org.dom4j.Element bodyElement = rootElement.element("body");
522                 if (bodyElement == null) {
523                     return null;
524                 }
525                 outStream = new ByteArrayOutputStream JavaDoc();
526                 XMLWriter writer;
527                 try {
528                     writer = new XMLWriter(outStream,
529                             new org.dom4j.io.OutputFormat("", true));
530                 } catch (UnsupportedEncodingException JavaDoc e) {
531                     log.error("Error ("
532                             + e.getClass().getName()
533                             + ") creating the document to write back out.",
534                             e);
535                     throw new RuntimeException JavaDoc(e);
536                 }
537                 Iterator JavaDoc bodyNodeIterator = bodyElement.nodeIterator();
538                 while(bodyNodeIterator.hasNext()) {
539                     try {
540                         writer.write((org.dom4j.Node)bodyNodeIterator.next());
541                     } catch (IOException JavaDoc e) {
542                         log.error("Error ("
543                                 + e.getClass().getName()
544                                 + ") writing the body back out:\n"
545                                 + bodyElement.asXML(),
546                                 e);
547                         throw new RuntimeException JavaDoc(e);
548                     }
549                 }
550             }
551             return outStream.toString();
552         }
553     }
554
555
556     /**
557      * <p>
558      * Get the name of the source or file, used for debugging.
559      * </p>
560      *
561      * @return the current value of the source name, output by the parser for
562      * debugging.
563      */

564     public final String JavaDoc getSourceName() {
565         return sourceName;
566     }
567
568     /**
569      * <p>
570      * Stores whether or not <code>format</code> should return just plain
571      * text with line feeds and converted horizonal rule. If <code>true</code>,
572      * fomratted text is returned, otherwise formatted
573      * HTML is returned.
574      * </p>
575      *
576      * @return Returns formattedText.
577      */

578     public boolean isFormattedText() {
579         return formattedText;
580     }
581
582     /**
583      * <p>
584      * Get whether or not the parser will only return plain text.
585      * </p>
586      *
587      * @return <code>true</code> if the parser will only return plain text,
588      * otherwise <code>false</code>.
589      */

590     public boolean isTextOnly() {
591         return textOnly;
592     }
593
594     /**
595      * <p>Write something other than a new line.</p>
596      */

597     private void notTextNewLine() {
598         textNewLineCount = 0;
599         textAtStartOfLine = false;
600     }
601     /**
602      * <p>
603      * Stores whether or not <code>format</code> should return just plain
604      * text with line feeds and converted horizonal rule. If <code>true</code>,
605      * fomratted text is returned, otherwise formatted
606      * HTML is returned.
607      * </p>
608      *
609      * @param formattedText The new value of formattedText to set.
610      */

611     public final void setFormattedText(final boolean formattedText) {
612         this.formattedText = formattedText;
613     }
614
615     /**
616      * <p>
617      * Contains uri to prepend when src attribute of an image begins with
618      * <em>cid: </em>- it's and embedded attachment
619      * </p>
620      *
621      * @param imageUri -
622      * the uri to prepend when src attribute of an image begins with
623      * <em>cid: </em>
624      */

625     public final void setImageUri(final String JavaDoc imageUri) {
626         this.imageUri = imageUri;
627     }
628
629     /**
630      * <p>
631      * The information to append to rewritten uris of embedded attachments
632      * </p>
633      *
634      * @param imageUriAppend
635      * the information to append to rewritten uris of embedded
636      * attachments
637      */

638     public final void setImageUriAppend(final String JavaDoc imageUriAppend) {
639         this.imageUriAppend = imageUriAppend;
640     }
641
642     /**
643      * <p>
644      * If <code>true</code> then only the contents of the body tag are returned.
645      * </p>
646      *
647      * @param onlyBodyContents
648      * set to <code>true</code to specify that the
649      * parser should only include the contents of the body tag.
650      */

651     public final void setOnlyBodyContents(final boolean onlyChildren) {
652         this.onlyBodyContents = onlyChildren;
653     }
654
655     /**
656      * <p>
657      * Set the name of the source or file, used for debugging.
658      * </p>
659      *
660      * @param sourceName
661      * the current value of the source name, output by the parser for
662      * debugging.
663      */

664     public final void setSourceName(final String JavaDoc sourceName) {
665         this.sourceName = sourceName;
666     }
667
668     /**
669      * <p>
670      * Set whether or not the parser should only return plain text.
671      * </p>
672      *
673      * @param textOnly
674      * set to <code>true</code> if the parser should only return
675      * plain text, otherwise <code>false</code>.
676      */

677     public final void setTextOnly(final boolean textOnly) {
678         this.textOnly = textOnly;
679     }
680 }
Popular Tags