KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > opencms > util > CmsHtmlConverter


1 /*
2  * File : $Source: /usr/local/cvs/opencms/src/org/opencms/util/CmsHtmlConverter.java,v $
3  * Date : $Date: 2006/10/06 09:17:16 $
4  * Version: $Revision: 1.27 $
5  *
6  * This library is part of OpenCms -
7  * the Open Source Content Management System
8  *
9  * Copyright (c) 2005 Alkacon Software GmbH (http://www.alkacon.com)
10  *
11  * This library is free software; you can redistribute it and/or
12  * modify it under the terms of the GNU Lesser General Public
13  * License as published by the Free Software Foundation; either
14  * version 2.1 of the License, or (at your option) any later version.
15  *
16  * This library is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19  * Lesser General Public License for more details.
20  *
21  * For further information about Alkacon Software GmbH, please see the
22  * company website: http://www.alkacon.com
23  *
24  * For further information about OpenCms, please see the
25  * project website: http://www.opencms.org
26  *
27  * You should have received a copy of the GNU Lesser General Public
28  * License along with this library; if not, write to the Free Software
29  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
30  */

31
32 package org.opencms.util;
33
34 import org.opencms.file.CmsObject;
35 import org.opencms.file.CmsProperty;
36 import org.opencms.file.CmsPropertyDefinition;
37 import org.opencms.file.CmsResource;
38 import org.opencms.i18n.CmsEncoder;
39 import org.opencms.main.CmsException;
40 import org.opencms.main.CmsLog;
41
42 import java.io.ByteArrayInputStream JavaDoc;
43 import java.io.ByteArrayOutputStream JavaDoc;
44 import java.io.UnsupportedEncodingException JavaDoc;
45 import java.util.ArrayList JavaDoc;
46 import java.util.List JavaDoc;
47 import java.util.Properties JavaDoc;
48 import java.util.StringTokenizer JavaDoc;
49 import java.util.regex.Pattern JavaDoc;
50
51 import org.apache.commons.logging.Log;
52
53 import org.w3c.tidy.Tidy;
54
55 /**
56  * Html cleaner and pretty printer.<p>
57  *
58  * Used to clean up html code (e.g. remove word tags) and optionally create xhtml from html.<p>
59  *
60  * @author Michael Emmerich
61  * @author Alexander Kandzior
62  *
63  * @version $Revision: 1.27 $
64  *
65  * @since 6.0.0
66  */

67 public class CmsHtmlConverter {
68
69     /** Param value for disabled mode. **/
70     public static final String JavaDoc PARAM_DISABLED = CmsStringUtil.FALSE;
71
72     /** Param value for enabled mode. **/
73     public static final String JavaDoc PARAM_ENABLED = CmsStringUtil.TRUE;
74
75     /** Param value for WORD mode. **/
76     public static final String JavaDoc PARAM_WORD = "cleanup";
77
78     /** Param value for XHTML mode. **/
79     public static final String JavaDoc PARAM_XHTML = "xhtml";
80
81     /** The log object for this class. */
82     private static final Log LOG = CmsLog.getLog(CmsHtmlConverter.class);
83
84     /** Regular expression for cleanup. */
85     String JavaDoc[] m_cleanupPatterns = {
86         "<o:p>.*(\\r\\n)*.*</o:p>",
87         "<o:p>.*(\\r\\n)*.*</O:p>",
88         "<\\?xml:.*(\\r\\n).*/>",
89         "<\\?xml:.*(\\r\\n).*(\\r\\n).*/\\?>",
90         "<\\?xml:.*(\\r\\n).*(\\r\\n).*/>",
91         "<\\?xml:(.*(\\r\\n)).*/\\?>",
92         "<o:SmartTagType.*(\\r\\n)*.*/>",
93         "<o:smarttagtype.*(\\r\\n)*.*/>"};
94
95     /** Patterns for cleanup. */
96     Pattern JavaDoc[] m_clearStyle;
97
98     /** The input encoding. */
99     String JavaDoc m_encoding;
100
101     /** Regular expression for replace. */
102     String JavaDoc[] m_replacePatterns = {
103         "&#160;",
104         "(\\r\\n){2,}",
105         "–",
106         "(\\n){2,}",
107         "\\(\\r\\n<",
108         "\\(\\n<",
109         "\\(\\r\\n(\\ ){1,}<",
110         "\\(\\n(\\ ){1,}<",
111         "\\r\\n<span",
112         "\\n<span"};
113
114     /** Patterns for replace. */
115     Pattern JavaDoc[] m_replaceStyle;
116
117     /** Values for replace. */
118     String JavaDoc[] m_replaceValues = {"&nbsp;", "", "&ndash;", "", "(<", "(<", "(<", "(<", "<span", "<span"};
119
120     /** The tidy to use. */
121     Tidy m_tidy;
122
123     /** The length of the line separator. */
124     private int m_lineSeparatorLength;
125
126     /** Indicates if this converter is enabled or not. */
127     private boolean m_modeEnabled;
128
129     /** Indicates if word cleanup mode is enabled or not. */
130     private boolean m_modeWord;
131
132     /** Indicates if xhtml conversion mode is enabled or not. */
133     private boolean m_modeXhtml;
134
135     /**
136      * Constructor, creates a new CmsHtmlConverter.<p>
137      *
138      * The encoding used by default is {@link CmsEncoder#ENCODING_UTF_8}.<p>
139      */

140     public CmsHtmlConverter() {
141
142         init(CmsEncoder.ENCODING_UTF_8, PARAM_ENABLED);
143     }
144
145     /**
146      * Constructor, creates a new CmsHtmlConverter.<p>
147      *
148      * Possible values for the conversion mode are:<ul>
149      * <li>{@link #PARAM_DISABLED}: The conversion is disabled.
150      * <li>{@link #PARAM_ENABLED}: Conversion is enabled without transformation, so html is pretty printed only.
151      * <li>{@link #PARAM_XHTML}: Conversion from html to xhtml is enabled.
152      * <li>{@link #PARAM_WORD}: Cleanup of word like html tags is enabled.
153      * </ul>
154      * Values can be combined with the <code>;</code> separator, so it's possible to convert
155      * to xhtml and clean from word at the same time.<p>
156      *
157      * @param encoding the encoding used for the html code conversion
158      * @param mode the conversion mode to use
159      */

160     public CmsHtmlConverter(String JavaDoc encoding, String JavaDoc mode) {
161
162         init(encoding, mode);
163     }
164
165     /**
166      * Reads the content conversion property of a given resource and returns it's value.<p>
167      *
168      * A default value (disabled) is returned if the property could not be read.<p>
169      *
170      * @param cms the CmsObject
171      * @param resource the resource in the vfs
172      * @return the content conversion property value
173      */

174     public static String JavaDoc getConversionSettings(CmsObject cms, CmsResource resource) {
175
176         // read the content-conversion property
177
String JavaDoc contentConversion;
178         try {
179             String JavaDoc resourceName = cms.getSitePath(resource);
180             CmsProperty contentConversionProperty = cms.readPropertyObject(
181                 resourceName,
182                 CmsPropertyDefinition.PROPERTY_CONTENT_CONVERSION,
183                 true);
184             contentConversion = contentConversionProperty.getValue();
185         } catch (CmsException e) {
186             // if there was an error reading the property, choose a default value
187
contentConversion = CmsHtmlConverter.PARAM_DISABLED;
188         }
189         return contentConversion;
190     }
191
192     /**
193      * Tests if the content conversion is enabled.<p>
194      *
195      * @param conversionMode the content conversion mode string
196      * @return ture or false
197      */

198     public static boolean isConversionEnabled(String JavaDoc conversionMode) {
199
200         boolean value = true;
201         if ((conversionMode == null) || (conversionMode.indexOf(PARAM_DISABLED) != -1)) {
202             value = false;
203         }
204         return value;
205     }
206
207     /**
208      * Converts the given html code according to the settings of this converter.<p>
209      *
210      * @param htmlInput html input stored in an array of bytes
211      * @return array of bytes contining the converted html
212      *
213      * @throws UnsupportedEncodingException if the encoding set for the conversion is not supported
214      */

215     public byte[] convertToByte(byte[] htmlInput) throws UnsupportedEncodingException JavaDoc {
216
217         if (m_modeEnabled) {
218             // only do any processing if the conversion is enabled
219
return convertToByte(new String JavaDoc(htmlInput, m_encoding));
220         }
221         return htmlInput;
222     }
223
224     /**
225      * Converts the given html code according to the settings of this converter.<p>
226      *
227      * @param htmlInput html input stored in a string
228      * @return array of bytes contining the converted html
229      *
230      * @throws UnsupportedEncodingException if the encoding set for the conversion is not supported
231      */

232     public byte[] convertToByte(String JavaDoc htmlInput) throws UnsupportedEncodingException JavaDoc {
233
234         return convertToString(htmlInput).getBytes(m_encoding);
235     }
236
237     /**
238      * Converts the given html code according to the settings of this converter.<p>
239      *
240      * If an any error occurs during the conversion process, the original input is returned unmodified.<p>
241      *
242      * @param htmlInput html input stored in an array of bytes
243      * @return array of bytes contining the converted html
244      */

245     public byte[] convertToByteSilent(byte[] htmlInput) {
246
247         try {
248             return convertToByte(htmlInput);
249         } catch (Exception JavaDoc e) {
250             if (LOG.isWarnEnabled()) {
251                 LOG.warn(Messages.get().getBundle().key(Messages.LOG_CONVERSION_BYTE_FAILED_0), e);
252             }
253             return htmlInput;
254         }
255     }
256
257     /**
258      * Converts the given html code according to the settings of this converter.<p>
259      *
260      * If an any error occurs during the conversion process, the original input is returned unmodified.<p>
261      *
262      * @param htmlInput html input stored in a string
263      * @return array of bytes contining the converted html
264      */

265     public byte[] convertToByteSilent(String JavaDoc htmlInput) {
266
267         try {
268             return convertToByte(htmlInput.getBytes(m_encoding));
269         } catch (Exception JavaDoc e) {
270             if (LOG.isWarnEnabled()) {
271                 LOG.warn(Messages.get().getBundle().key(Messages.LOG_CONVERSION_BYTE_FAILED_0), e);
272             }
273             try {
274                 return htmlInput.getBytes(m_encoding);
275             } catch (UnsupportedEncodingException JavaDoc e1) {
276                 if (LOG.isWarnEnabled()) {
277                     LOG.warn(Messages.get().getBundle().key(Messages.LOG_CONVERSION_BYTE_FAILED_0), e1);
278                 }
279                 return htmlInput.getBytes();
280             }
281         }
282     }
283
284     /**
285      * Converts the given html code according to the settings of this converter.<p>
286      *
287      * @param htmlInput html input stored in an array of bytes
288      * @return string contining the converted html
289      *
290      * @throws UnsupportedEncodingException if the encoding set for the conversion is not supported
291      */

292     public String JavaDoc convertToString(byte[] htmlInput) throws UnsupportedEncodingException JavaDoc {
293
294         return convertToString(new String JavaDoc(htmlInput, m_encoding));
295     }
296
297     /**
298      * Converts the given html code according to the settings of this converter.<p>
299      *
300      * @param htmlInput html input stored in a string
301      * @return string contining the converted html
302      *
303      * @throws UnsupportedEncodingException if the encoding set for the conversion is not supported
304      */

305     public String JavaDoc convertToString(String JavaDoc htmlInput) throws UnsupportedEncodingException JavaDoc {
306
307         // only do parsing if the mode is not set to disabled
308
if (m_modeEnabled) {
309
310             // do a maximum of 10 loops
311
int max = m_modeWord ? 10 : 1;
312             int count = 0;
313
314             // we may have to do several parsing runs until all tags are removed
315
int oldSize = htmlInput.length();
316             String JavaDoc workHtml = regExp(htmlInput);
317             while (count < max) {
318                 count++;
319
320                 // first add the optional header if in word mode
321
if (m_modeWord) {
322                     workHtml = adjustHtml(workHtml);
323                 }
324                 // now use tidy to parse and format the html
325
workHtml = parse(workHtml, m_encoding);
326                 if (m_modeWord) {
327                     // cut off the line separator, which is always appended
328
workHtml = workHtml.substring(0, workHtml.length() - m_lineSeparatorLength);
329                 }
330
331                 if (workHtml.length() == oldSize) {
332                     // no change in html code after last processing loop
333
workHtml = regExp(workHtml);
334                     break;
335                 }
336                 oldSize = workHtml.length();
337                 workHtml = regExp(workHtml);
338             }
339             if (LOG.isInfoEnabled()) {
340                 LOG.info(Messages.get().getBundle().key(
341                     Messages.LOG_PARSING_RUNS_2,
342                     this.getClass().getName(),
343                     new Integer JavaDoc(count)));
344             }
345             htmlInput = workHtml;
346         }
347
348         return htmlInput;
349     }
350
351     /**
352      * Converts the given html code according to the settings of this converter.<p>
353      *
354      * If an any error occurs during the conversion process, the original input is returned unmodified.<p>
355      *
356      * @param htmlInput html input stored in an array of bytes
357      *
358      * @return string contining the converted html
359      */

360     public String JavaDoc convertToStringSilent(byte[] htmlInput) {
361
362         try {
363             return convertToString(htmlInput);
364         } catch (Exception JavaDoc e) {
365             if (LOG.isWarnEnabled()) {
366                 LOG.warn(Messages.get().getBundle().key(Messages.LOG_CONVERSION_BYTE_FAILED_0), e);
367             }
368             try {
369                 return new String JavaDoc(htmlInput, m_encoding);
370             } catch (UnsupportedEncodingException JavaDoc e1) {
371                 if (LOG.isWarnEnabled()) {
372                     LOG.warn(Messages.get().getBundle().key(Messages.LOG_CONVERSION_BYTE_FAILED_0), e1);
373                 }
374                 return new String JavaDoc(htmlInput);
375             }
376         }
377     }
378
379     /**
380      * Converts the given html code according to the settings of this converter.<p>
381      *
382      * If an any error occurs during the conversion process, the original input is returned unmodified.<p>
383      *
384      * @param htmlInput html input stored in string
385      *
386      * @return string contining the converted html
387      */

388     public String JavaDoc convertToStringSilent(String JavaDoc htmlInput) {
389
390         try {
391             return convertToString(htmlInput);
392         } catch (Exception JavaDoc e) {
393             if (LOG.isWarnEnabled()) {
394                 LOG.warn(Messages.get().getBundle().key(Messages.LOG_CONVERSION_BYTE_FAILED_0), e);
395             }
396             return htmlInput;
397         }
398     }
399
400     /**
401      * Returns the encoding used for the html code conversion.<p>
402      *
403      * @return the encoding used for the html code conversion
404      */

405     public String JavaDoc getEncoding() {
406
407         return m_encoding;
408     }
409
410     /**
411      * Adjusts the html input code in WORD mode if nescessary.<p>
412      *
413      * When in WORD mode, the html tag must contain the xmlns:o="urn:schemas-microsoft-com:office:office"
414      * attribute, otherwiese tide will not remove the WORD tags from the document.
415      *
416      * @param htmlInput the html input
417      * @return adjusted html input
418      */

419     private String JavaDoc adjustHtml(String JavaDoc htmlInput) {
420
421         // check if we have some opening and closing html tags
422
if ((htmlInput.toLowerCase().indexOf("<html>") == -1) && (htmlInput.toLowerCase().indexOf("</html>") == -1)) {
423             // add a correct <html> tag for word generated html
424
StringBuffer JavaDoc tmp = new StringBuffer JavaDoc();
425             tmp.append("<html xmlns:o=\"\"><body>");
426             tmp.append(htmlInput);
427             tmp.append("</body></html>");
428             htmlInput = tmp.toString();
429         }
430         return htmlInput;
431     }
432
433     /**
434      * Extracts all mode parameters from the mode property value and stores them in a list.<p>
435      *
436      * Values must be seperated iwth a semicolon.
437      *
438      * @param mode the mode paramter string
439      * @return list with all extracted nodes
440      */

441     private List JavaDoc extractModes(String JavaDoc mode) {
442
443         ArrayList JavaDoc extractedModes = new ArrayList JavaDoc();
444         if (mode != null) {
445             StringTokenizer JavaDoc extract = new StringTokenizer JavaDoc(mode, ";");
446             while (extract.hasMoreTokens()) {
447                 String JavaDoc tok = extract.nextToken();
448                 extractedModes.add(tok);
449             }
450         }
451         return extractedModes;
452     }
453
454     /**
455      * Initializes the CmsHtmlConverter.<p>
456      *
457      * @param encoding the encoding used for the html code conversion
458      * @param mode the mode parameter to select the operation mode of the converter.
459      */

460     private void init(String JavaDoc encoding, String JavaDoc mode) {
461
462         // extract all operation mode
463
List JavaDoc modes = extractModes(mode);
464
465         // confiugurate the tidy depending on the operation mode
466
if (modes.contains(PARAM_ENABLED)) {
467             m_modeEnabled = true;
468         }
469         if (modes.contains(PARAM_XHTML)) {
470             m_modeEnabled = true;
471             m_modeXhtml = true;
472         }
473         if (modes.contains(PARAM_WORD)) {
474             m_modeEnabled = true;
475             m_modeWord = true;
476         }
477
478         // set the encoding
479
m_encoding = encoding;
480
481         // get line separator length
482
m_lineSeparatorLength = System.getProperty("line.separator").length();
483
484         // we need this only if the conversion is enabled
485
if (m_modeEnabled) {
486
487             // create the main tidy object
488
m_tidy = new Tidy();
489
490             // set specified word, xhtml conversion settings
491
m_tidy.setXHTML(m_modeXhtml);
492             m_tidy.setWord2000(m_modeWord);
493
494             // add additional tags
495
// those are required to handle word 2002 (and newer) documents
496
Properties JavaDoc additionalTags = new Properties JavaDoc();
497             additionalTags.put("new-empty-tags", "o:smarttagtype");
498             additionalTags.put("new-inline-tags", "o:smarttagtype");
499             m_tidy.getConfiguration().addProps(additionalTags);
500
501             // set the default tidy configuration
502

503             // set the tidy encoding
504
m_tidy.setInputEncoding(encoding);
505             m_tidy.setOutputEncoding(encoding);
506
507             // disable the tidy meta element in output
508
m_tidy.setTidyMark(false);
509             // disable clean mode
510
m_tidy.setMakeClean(false);
511             // enable num entities
512
m_tidy.setNumEntities(true);
513             // create output of the body only
514
m_tidy.setPrintBodyOnly(true);
515             // force output creation even if there are tidy errors
516
m_tidy.setForceOutput(true);
517             // set tidy to quiet mode to prevent output
518
m_tidy.setQuiet(true);
519             // disable warning output
520
m_tidy.setShowWarnings(false);
521             // allow comments in the output
522
m_tidy.setHideComments(false);
523             // set no line break before a <br>
524
m_tidy.setBreakBeforeBR(false);
525             // dont wrap attribute values
526
m_tidy.setWrapAttVals(false);
527             // warp lines after 100 chars
528
m_tidy.setWraplen(100);
529             // no indentation
530
m_tidy.setSpaces(0);
531
532             if (m_modeWord) {
533                 // create the regexp for cleanup, only used in word clean mode
534
m_clearStyle = new Pattern JavaDoc[m_cleanupPatterns.length];
535                 for (int i = 0; i < m_cleanupPatterns.length; i++) {
536                     m_clearStyle[i] = Pattern.compile(m_cleanupPatterns[i]);
537                 }
538             }
539
540             // create the regexp for replace
541
m_replaceStyle = new Pattern JavaDoc[m_replacePatterns.length];
542             for (int i = 0; i < m_replacePatterns.length; i++) {
543                 m_replaceStyle[i] = Pattern.compile(m_replacePatterns[i]);
544             }
545         }
546     }
547
548     /**
549      * Parses a byte array containing html code with different parsing modes.<p>
550      *
551      * @param htmlInput a byte array containing raw html code
552      * @param encoding the encoding
553      *
554      * @return parsed and cleared html code
555      *
556      * @throws UnsupportedEncodingException if the encoding set for the conversion is not supported
557      */

558     private String JavaDoc parse(String JavaDoc htmlInput, String JavaDoc encoding) throws UnsupportedEncodingException JavaDoc {
559
560         // prepare the streams
561
ByteArrayInputStream JavaDoc in = new ByteArrayInputStream JavaDoc(htmlInput.getBytes(encoding));
562         ByteArrayOutputStream JavaDoc out = new ByteArrayOutputStream JavaDoc();
563         // do the parsing
564
m_tidy.parse(in, out);
565         // return the result
566
byte[] result = out.toByteArray();
567         return new String JavaDoc(result, encoding);
568     }
569
570     /**
571      * Parses the htmlInput with regular expressions for cleanup purposes.<p>
572      *
573      * @param htmlInput the html input
574      * @return processed html
575      */

576     private String JavaDoc regExp(String JavaDoc htmlInput) {
577
578         String JavaDoc parsedHtml = htmlInput.trim();
579
580         if (m_modeWord) {
581             // process all cleanup regexp
582
for (int i = 0; i < m_cleanupPatterns.length; i++) {
583                 parsedHtml = m_clearStyle[i].matcher(parsedHtml).replaceAll("");
584             }
585         }
586
587         // process all replace regexp
588
for (int i = 0; i < m_replacePatterns.length; i++) {
589             parsedHtml = m_replaceStyle[i].matcher(parsedHtml).replaceAll(m_replaceValues[i]);
590         }
591
592         return parsedHtml;
593     }
594 }
Popular Tags