CmsHtmlConverter


1   /*
2    * File   : $Source: /usr/local/cvs/opencms/src/org/opencms/util/CmsHtmlConverter.java,v $
3    * Date   : $Date: 2006/10/06 09:17:16 $
4    * Version: $Revision: 1.27 $
5    *
6    * This library is part of OpenCms -
7    * the Open Source Content Management System
8    *
9    * Copyright (c) 2005 Alkacon Software GmbH (http://www.alkacon.com)
10   *
11   * This library is free software; you can redistribute it and/or
12   * modify it under the terms of the GNU Lesser General Public
13   * License as published by the Free Software Foundation; either
14   * version 2.1 of the License, or (at your option) any later version.
15   *
16   * This library is distributed in the hope that it will be useful,
17   * but WITHOUT ANY WARRANTY; without even the implied warranty of
18   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19   * Lesser General Public License for more details.
20   *
21   * For further information about Alkacon Software GmbH, please see the
22   * company website: http://www.alkacon.com
23   *
24   * For further information about OpenCms, please see the
25   * project website: http://www.opencms.org
26   * 
27   * You should have received a copy of the GNU Lesser General Public
28   * License along with this library; if not, write to the Free Software
29   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
30   */
31  
32  package org.opencms.util;
33  
34  import org.opencms.file.CmsObject;
35  import org.opencms.file.CmsProperty;
36  import org.opencms.file.CmsPropertyDefinition;
37  import org.opencms.file.CmsResource;
38  import org.opencms.i18n.CmsEncoder;
39  import org.opencms.main.CmsException;
40  import org.opencms.main.CmsLog;
41  
42  import java.io.ByteArrayInputStream  ;
43  import java.io.ByteArrayOutputStream  ;
44  import java.io.UnsupportedEncodingException  ;
45  import java.util.ArrayList  ;
46  import java.util.List  ;
47  import java.util.Properties  ;
48  import java.util.StringTokenizer  ;
49  import java.util.regex.Pattern  ;
50  
51  import org.apache.commons.logging.Log;
52  
53  import org.w3c.tidy.Tidy;
54  
55  /**
56   * Html cleaner and pretty printer.<p>
57   * 
58   * Used to clean up html code (e.g. remove word tags) and optionally create xhtml from html.<p>
59   *   
60   * @author Michael Emmerich 
61   * @author Alexander Kandzior
62   * 
63   * @version $Revision: 1.27 $ 
64   * 
65   * @since 6.0.0 
66   */
67  public class CmsHtmlConverter {
68  
69      /** Param value for disabled mode. **/
70      public static final String   PARAM_DISABLED = CmsStringUtil.FALSE;
71  
72      /** Param value for enabled mode. **/
73      public static final String   PARAM_ENABLED = CmsStringUtil.TRUE;
74  
75      /** Param value for WORD mode. **/
76      public static final String   PARAM_WORD = "cleanup";
77  
78      /** Param value for XHTML mode. **/
79      public static final String   PARAM_XHTML = "xhtml";
80  
81      /** The log object for this class. */
82      private static final Log LOG = CmsLog.getLog(CmsHtmlConverter.class);
83  
84      /** Regular expression for cleanup. */
85      String  [] m_cleanupPatterns = {
86          "<o:p>.*(\\r\\n)*.*</o:p>",
87          "<o:p>.*(\\r\\n)*.*</O:p>",
88          "<\\?xml:.*(\\r\\n).*/>",
89          "<\\?xml:.*(\\r\\n).*(\\r\\n).*/\\?>",
90          "<\\?xml:.*(\\r\\n).*(\\r\\n).*/>",
91          "<\\?xml:(.*(\\r\\n)).*/\\?>",
92          "<o:SmartTagType.*(\\r\\n)*.*/>",
93          "<o:smarttagtype.*(\\r\\n)*.*/>"};
94  
95      /** Patterns for cleanup. */
96      Pattern  [] m_clearStyle;
97  
98      /** The input encoding. */
99      String   m_encoding;
100 
101     /** Regular expression for replace. */
102     String  [] m_replacePatterns = {
103         "&#160;",
104         "(\\r\\n){2,}",
105         "�",
106         "(\\n){2,}",
107         "\\(\\r\\n<",
108         "\\(\\n<",
109         "\\(\\r\\n(\\ ){1,}<",
110         "\\(\\n(\\ ){1,}<",
111         "\\r\\n<span",
112         "\\n<span"};
113 
114     /** Patterns for replace. */
115     Pattern  [] m_replaceStyle;
116 
117     /** Values for replace. */
118     String  [] m_replaceValues = {"&nbsp;", "", "&ndash;", "", "(<", "(<", "(<", "(<", "<span", "<span"};
119 
120     /** The tidy to use. */
121     Tidy m_tidy;
122 
123     /** The length of the line separator. */
124     private int m_lineSeparatorLength;
125 
126     /** Indicates if this converter is enabled or not. */
127     private boolean m_modeEnabled;
128 
129     /** Indicates if word cleanup mode is enabled or not. */
130     private boolean m_modeWord;
131 
132     /** Indicates if xhtml conversion mode is enabled or not. */
133     private boolean m_modeXhtml;
134 
135     /**
136      * Constructor, creates a new CmsHtmlConverter.<p>
137      * 
138      * The encoding used by default is {@link CmsEncoder#ENCODING_UTF_8}.<p>
139      */
140     public CmsHtmlConverter() {
141 
142         init(CmsEncoder.ENCODING_UTF_8, PARAM_ENABLED);
143     }
144 
145     /**
146      * Constructor, creates a new CmsHtmlConverter.<p>
147      * 
148      * Possible values for the conversion mode are:<ul>
149      * <li>{@link #PARAM_DISABLED}: The conversion is disabled.
150      * <li>{@link #PARAM_ENABLED}: Conversion is enabled without transformation, so html is pretty printed only. 
151      * <li>{@link #PARAM_XHTML}: Conversion from html to xhtml is enabled.
152      * <li>{@link #PARAM_WORD}: Cleanup of word like html tags is enabled.
153      * </ul>
154      * Values can be combined with the <code>;</code> separator, so it's possible to convert 
155      * to xhtml and clean from word at the same time.<p>
156      * 
157      * @param encoding the encoding used for the html code conversion
158      * @param mode the conversion mode to use
159      */
160     public CmsHtmlConverter(String   encoding, String   mode) {
161 
162         init(encoding, mode);
163     }
164 
165     /**
166      * Reads the content conversion property of a given resource and returns it's value.<p>
167      * 
168      * A default value (disabled) is returned if the property could not be read.<p>
169      * 
170      * @param cms the CmsObject
171      * @param resource the resource in the vfs
172      * @return the content conversion property value
173      */
174     public static String   getConversionSettings(CmsObject cms, CmsResource resource) {
175 
176         // read the content-conversion property
177         String   contentConversion;
178         try {
179             String   resourceName = cms.getSitePath(resource);
180             CmsProperty contentConversionProperty = cms.readPropertyObject(
181                 resourceName,
182                 CmsPropertyDefinition.PROPERTY_CONTENT_CONVERSION,
183                 true);
184             contentConversion = contentConversionProperty.getValue();
185         } catch (CmsException e) {
186             // if there was an error reading the property, choose a default value
187             contentConversion = CmsHtmlConverter.PARAM_DISABLED;
188         }
189         return contentConversion;
190     }
191 
192     /**
193      * Tests if the content conversion is enabled.<p>
194      * 
195      * @param conversionMode the content conversion mode string
196      * @return ture or false
197      */
198     public static boolean isConversionEnabled(String   conversionMode) {
199 
200         boolean value = true;
201         if ((conversionMode == null) || (conversionMode.indexOf(PARAM_DISABLED) != -1)) {
202             value = false;
203         }
204         return value;
205     }
206 
207     /**
208      * Converts the given html code according to the settings of this converter.<p>
209      * 
210      * @param htmlInput html input stored in an array of bytes
211      * @return array of bytes contining the converted html
212      * 
213      * @throws UnsupportedEncodingException if the encoding set for the conversion is not supported
214      */
215     public byte[] convertToByte(byte[] htmlInput) throws UnsupportedEncodingException   {
216 
217         if (m_modeEnabled) {
218             // only do any processing if the conversion is enabled
219             return convertToByte(new String  (htmlInput, m_encoding));
220         }
221         return htmlInput;
222     }
223 
224     /**
225      * Converts the given html code according to the settings of this converter.<p>
226      * 
227      * @param htmlInput html input stored in a string
228      * @return array of bytes contining the converted html
229      * 
230      * @throws UnsupportedEncodingException if the encoding set for the conversion is not supported
231      */
232     public byte[] convertToByte(String   htmlInput) throws UnsupportedEncodingException   {
233 
234         return convertToString(htmlInput).getBytes(m_encoding);
235     }
236 
237     /**
238      * Converts the given html code according to the settings of this converter.<p>
239      * 
240      * If an any error occurs during the conversion process, the original input is returned unmodified.<p>
241      * 
242      * @param htmlInput html input stored in an array of bytes
243      * @return array of bytes contining the converted html
244      */
245     public byte[] convertToByteSilent(byte[] htmlInput) {
246 
247         try {
248             return convertToByte(htmlInput);
249         } catch (Exception   e) {
250             if (LOG.isWarnEnabled()) {
251                 LOG.warn(Messages.get().getBundle().key(Messages.LOG_CONVERSION_BYTE_FAILED_0), e);
252             }
253             return htmlInput;
254         }
255     }
256 
257     /**
258      * Converts the given html code according to the settings of this converter.<p>
259      * 
260      * If an any error occurs during the conversion process, the original input is returned unmodified.<p>
261      * 
262      * @param htmlInput html input stored in a string
263      * @return array of bytes contining the converted html
264      */
265     public byte[] convertToByteSilent(String   htmlInput) {
266 
267         try {
268             return convertToByte(htmlInput.getBytes(m_encoding));
269         } catch (Exception   e) {
270             if (LOG.isWarnEnabled()) {
271                 LOG.warn(Messages.get().getBundle().key(Messages.LOG_CONVERSION_BYTE_FAILED_0), e);
272             }
273             try {
274                 return htmlInput.getBytes(m_encoding);
275             } catch (UnsupportedEncodingException   e1) {
276                 if (LOG.isWarnEnabled()) {
277                     LOG.warn(Messages.get().getBundle().key(Messages.LOG_CONVERSION_BYTE_FAILED_0), e1);
278                 }
279                 return htmlInput.getBytes();
280             }
281         }
282     }
283 
284     /**
285      * Converts the given html code according to the settings of this converter.<p>
286      * 
287      * @param htmlInput html input stored in an array of bytes
288      * @return string contining the converted html
289      * 
290      * @throws UnsupportedEncodingException if the encoding set for the conversion is not supported
291      */
292     public String   convertToString(byte[] htmlInput) throws UnsupportedEncodingException   {
293 
294         return convertToString(new String  (htmlInput, m_encoding));
295     }
296 
297     /**
298      * Converts the given html code according to the settings of this converter.<p>
299      * 
300      * @param htmlInput html input stored in a string
301      * @return string contining the converted html
302      * 
303      * @throws UnsupportedEncodingException if the encoding set for the conversion is not supported
304      */
305     public String   convertToString(String   htmlInput) throws UnsupportedEncodingException   {
306 
307         // only do parsing if the mode is not set to disabled
308         if (m_modeEnabled) {
309 
310             // do a maximum of 10 loops
311             int max = m_modeWord ? 10 : 1;
312             int count = 0;
313 
314             // we may have to do several parsing runs until all tags are removed
315             int oldSize = htmlInput.length();
316             String   workHtml = regExp(htmlInput);
317             while (count < max) {
318                 count++;
319 
320                 // first add the optional header if in word mode   
321                 if (m_modeWord) {
322                     workHtml = adjustHtml(workHtml);
323                 }
324                 // now use tidy to parse and format the html
325                 workHtml = parse(workHtml, m_encoding);
326                 if (m_modeWord) {
327                     // cut off the line separator, which is always appended
328                     workHtml = workHtml.substring(0, workHtml.length() - m_lineSeparatorLength);
329                 }
330 
331                 if (workHtml.length() == oldSize) {
332                     // no change in html code after last processing loop
333                     workHtml = regExp(workHtml);
334                     break;
335                 }
336                 oldSize = workHtml.length();
337                 workHtml = regExp(workHtml);
338             }
339             if (LOG.isInfoEnabled()) {
340                 LOG.info(Messages.get().getBundle().key(
341                     Messages.LOG_PARSING_RUNS_2,
342                     this.getClass().getName(),
343                     new Integer  (count)));
344             }
345             htmlInput = workHtml;
346         }
347 
348         return htmlInput;
349     }
350 
351     /**
352      * Converts the given html code according to the settings of this converter.<p>
353      * 
354      * If an any error occurs during the conversion process, the original input is returned unmodified.<p>
355      * 
356      * @param htmlInput html input stored in an array of bytes
357      * 
358      * @return string contining the converted html
359      */
360     public String   convertToStringSilent(byte[] htmlInput) {
361 
362         try {
363             return convertToString(htmlInput);
364         } catch (Exception   e) {
365             if (LOG.isWarnEnabled()) {
366                 LOG.warn(Messages.get().getBundle().key(Messages.LOG_CONVERSION_BYTE_FAILED_0), e);
367             }
368             try {
369                 return new String  (htmlInput, m_encoding);
370             } catch (UnsupportedEncodingException   e1) {
371                 if (LOG.isWarnEnabled()) {
372                     LOG.warn(Messages.get().getBundle().key(Messages.LOG_CONVERSION_BYTE_FAILED_0), e1);
373                 }
374                 return new String  (htmlInput);
375             }
376         }
377     }
378 
379     /**
380      * Converts the given html code according to the settings of this converter.<p>
381      * 
382      * If an any error occurs during the conversion process, the original input is returned unmodified.<p>
383      * 
384      * @param htmlInput html input stored in string 
385      * 
386      * @return string contining the converted html
387      */
388     public String   convertToStringSilent(String   htmlInput) {
389 
390         try {
391             return convertToString(htmlInput);
392         } catch (Exception   e) {
393             if (LOG.isWarnEnabled()) {
394                 LOG.warn(Messages.get().getBundle().key(Messages.LOG_CONVERSION_BYTE_FAILED_0), e);
395             }
396             return htmlInput;
397         }
398     }
399 
400     /**
401      * Returns the encoding used for the html code conversion.<p>
402      * 
403      * @return the encoding used for the html code conversion
404      */
405     public String   getEncoding() {
406 
407         return m_encoding;
408     }
409 
410     /**
411      * Adjusts the html input code in WORD mode if nescessary.<p>
412      * 
413      * When in WORD mode, the html tag must contain the xmlns:o="urn:schemas-microsoft-com:office:office"
414      * attribute, otherwiese tide will not remove the WORD tags from the document.
415      * 
416      * @param htmlInput the html input
417      * @return adjusted html input
418      */
419     private String   adjustHtml(String   htmlInput) {
420 
421         // check if we have some opening and closing html tags
422         if ((htmlInput.toLowerCase().indexOf("<html>") == -1) && (htmlInput.toLowerCase().indexOf("</html>") == -1)) {
423             // add a correct <html> tag for word generated html
424             StringBuffer   tmp = new StringBuffer  ();
425             tmp.append("<html xmlns:o=\"\"><body>");
426             tmp.append(htmlInput);
427             tmp.append("</body></html>");
428             htmlInput = tmp.toString();
429         }
430         return htmlInput;
431     }
432 
433     /**
434      * Extracts all mode parameters from the mode property value and stores them in a list.<p>
435      * 
436      * Values must be seperated iwth a semicolon.
437      * 
438      * @param mode the mode paramter string
439      * @return list with all extracted nodes
440      */
441     private List   extractModes(String   mode) {
442 
443         ArrayList   extractedModes = new ArrayList  ();
444         if (mode != null) {
445             StringTokenizer   extract = new StringTokenizer  (mode, ";");
446             while (extract.hasMoreTokens()) {
447                 String   tok = extract.nextToken();
448                 extractedModes.add(tok);
449             }
450         }
451         return extractedModes;
452     }
453 
454     /**
455      * Initializes the CmsHtmlConverter.<p>
456      * 
457      * @param encoding the encoding used for the html code conversion
458      * @param mode the mode parameter to select the operation mode of the converter.
459      */
460     private void init(String   encoding, String   mode) {
461 
462         // extract all operation mode
463         List   modes = extractModes(mode);
464 
465         // confiugurate the tidy depending on the operation mode
466         if (modes.contains(PARAM_ENABLED)) {
467             m_modeEnabled = true;
468         }
469         if (modes.contains(PARAM_XHTML)) {
470             m_modeEnabled = true;
471             m_modeXhtml = true;
472         }
473         if (modes.contains(PARAM_WORD)) {
474             m_modeEnabled = true;
475             m_modeWord = true;
476         }
477 
478         // set the encoding
479         m_encoding = encoding;
480 
481         // get line separator length
482         m_lineSeparatorLength = System.getProperty("line.separator").length();
483 
484         // we need this only if the conversion is enabled
485         if (m_modeEnabled) {
486 
487             // create the main tidy object
488             m_tidy = new Tidy();
489 
490             // set specified word, xhtml conversion settings
491             m_tidy.setXHTML(m_modeXhtml);
492             m_tidy.setWord2000(m_modeWord);
493 
494             // add additional tags
495             // those are required to handle word 2002 (and newer) documents
496             Properties   additionalTags = new Properties  ();
497             additionalTags.put("new-empty-tags", "o:smarttagtype");
498             additionalTags.put("new-inline-tags", "o:smarttagtype");
499             m_tidy.getConfiguration().addProps(additionalTags);
500 
501             // set the default tidy configuration
502 
503             // set the tidy encoding
504             m_tidy.setInputEncoding(encoding);
505             m_tidy.setOutputEncoding(encoding);
506 
507             // disable the tidy meta element in output
508             m_tidy.setTidyMark(false);
509             // disable clean mode
510             m_tidy.setMakeClean(false);
511             // enable num entities
512             m_tidy.setNumEntities(true);
513             // create output of the body only
514             m_tidy.setPrintBodyOnly(true);
515             // force output creation even if there are tidy errors
516             m_tidy.setForceOutput(true);
517             // set tidy to quiet mode to prevent output        
518             m_tidy.setQuiet(true);
519             // disable warning output
520             m_tidy.setShowWarnings(false);
521             // allow comments in the output
522             m_tidy.setHideComments(false);
523             // set no line break before a <br>
524             m_tidy.setBreakBeforeBR(false);
525             // dont wrap attribute values
526             m_tidy.setWrapAttVals(false);
527             // warp lines after 100 chars
528             m_tidy.setWraplen(100);
529             // no indentation
530             m_tidy.setSpaces(0);
531 
532             if (m_modeWord) {
533                 // create the regexp for cleanup, only used in word clean mode
534                 m_clearStyle = new Pattern  [m_cleanupPatterns.length];
535                 for (int i = 0; i < m_cleanupPatterns.length; i++) {
536                     m_clearStyle[i] = Pattern.compile(m_cleanupPatterns[i]);
537                 }
538             }
539 
540             // create the regexp for replace
541             m_replaceStyle = new Pattern  [m_replacePatterns.length];
542             for (int i = 0; i < m_replacePatterns.length; i++) {
543                 m_replaceStyle[i] = Pattern.compile(m_replacePatterns[i]);
544             }
545         }
546     }
547 
548     /**
549      * Parses a byte array containing html code with different parsing modes.<p>
550      * 
551      * @param htmlInput a byte array containing raw html code
552      * @param encoding the  encoding
553      * 
554      * @return parsed and cleared html code
555      * 
556      * @throws UnsupportedEncodingException if the encoding set for the conversion is not supported
557      */
558     private String   parse(String   htmlInput, String   encoding) throws UnsupportedEncodingException   {
559 
560         // prepare the streams
561         ByteArrayInputStream   in = new ByteArrayInputStream  (htmlInput.getBytes(encoding));
562         ByteArrayOutputStream   out = new ByteArrayOutputStream  ();
563         // do the parsing
564         m_tidy.parse(in, out);
565         // return the result
566         byte[] result = out.toByteArray();
567         return new String  (result, encoding);
568     }
569 
570     /**
571      * Parses the htmlInput with regular expressions for cleanup purposes.<p>
572      * 
573      * @param htmlInput the html input
574      * @return processed html
575      */
576     private String   regExp(String   htmlInput) {
577 
578         String   parsedHtml = htmlInput.trim();
579 
580         if (m_modeWord) {
581             // process all cleanup regexp
582             for (int i = 0; i < m_cleanupPatterns.length; i++) {
583                 parsedHtml = m_clearStyle[i].matcher(parsedHtml).replaceAll("");
584             }
585         }
586 
587         // process all replace regexp
588         for (int i = 0; i < m_replacePatterns.length; i++) {
589             parsedHtml = m_replaceStyle[i].matcher(parsedHtml).replaceAll(m_replaceValues[i]);
590         }
591 
592         return parsedHtml;
593     }
594 }
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags