KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > jahia > services > htmlparser > TidyHtmlParser


1 package org.jahia.services.htmlparser;
2
3 import java.io.*;
4 import java.util.*;
5
6 import javax.xml.parsers.*;
7 import javax.xml.transform.*;
8 import javax.xml.transform.dom.*;
9 import javax.xml.transform.stream.*;
10
11 import org.jahia.registries.*;
12 import org.jahia.settings.*;
13 import org.jahia.utils.*;
14 import org.jahia.utils.fileparsers.*;
15 import org.jahia.utils.properties.*;
16 import org.w3c.dom.*;
17 import org.w3c.tidy.*;
18 import org.xml.sax.*;
19
20 /**
21  *
22  * <p>Title: Html Parser default implementation based on Tidy</p>
23  * <p>Description: </p>
24  * <p>Copyright: Copyright (c) 2002</p>
25  * <p>Company: </p>
26  * @author Khue Nguyen
27  * @version 1.0
28  */

29 public class TidyHtmlParser implements HtmlParser {
30
31     private static org.apache.log4j.Logger logger =
32                 org.apache.log4j.Logger.getLogger(TidyHtmlParser.class);
33
34     public static String JavaDoc AMPERSAND = "$$$amp$$$";
35     public static String JavaDoc AMPERSAND_SECONDPASS = "$$$amp_secondpass$$$";
36     public static String JavaDoc TIDYERRORS_TAG = "TIDYERRORS";
37
38     private static Vector newInlineTags = new Vector();
39     private static Vector newBlockLevelTags = new Vector();
40     private static Vector unrecognizedTags = new Vector();
41
42     private Properties config = new Properties();
43
44     public TidyHtmlParser(){}
45
46     public TidyHtmlParser(Properties config){
47         this.config = config;
48         if ( this.config == null ){
49             this.config = new Properties();
50         }
51     }
52
53     /**
54      *
55      * @param htmlParserService HtmlParserService
56      */

57     public void init(HtmlParserService htmlParserService){
58
59         SettingsBean settings = htmlParserService.getSettingsBean();
60         if ( settings == null ){
61             return;
62         }
63         String JavaDoc fileName = settings.getPropertiesFile().getProperty("tidyConfig");
64         if ( fileName == null || "".equals(fileName.trim()) ){
65             fileName = "tidy.properties";
66         }
67         StringBuffer JavaDoc buff = new StringBuffer JavaDoc(settings.getJahiaEtcDiskPath());
68         buff.append(File.separator);
69         buff.append("config");
70         buff.append(File.separator);
71         buff.append(fileName);
72         try {
73             PropertiesManager propManager = new PropertiesManager(buff.toString());
74             this.config = propManager.getPropertiesObject();
75         } catch ( Throwable JavaDoc t ){
76             logger.debug("Error loading tidy config file, use default settings ",t);
77         }
78         if ( this.config == null ){
79             this.config = new Properties();
80         }
81     }
82
83     /**
84      * Parses and generates a clean html document, remove unwanted markups,..
85      * Using default settings
86      *
87      * @param inputString
88      * @param DOMVisitors
89      * @return
90      */

91     public String JavaDoc parse(String JavaDoc inputString, Vector DOMVisitors){
92         return parse(inputString,-1,config,DOMVisitors);
93     }
94
95     /**
96      * Parses and generates a clean html document, remove unwanted markups,..
97      * Using settings as defined for a given site
98      *
99      * @param inputString
100      * @param DOMVisitors
101      * @param siteKey
102      * @return
103      */

104     public String JavaDoc parse(String JavaDoc inputString, Vector DOMVisitors,
105                         int siteId){
106         if ( inputString == null || inputString.trim().equals("") ){
107             return inputString;
108         }
109         return parse(inputString,siteId,config,DOMVisitors);
110     }
111
112     /**
113      * Parses and generates a clean html document, remove unwanted markups,..
114      * Using settings as defined for a given site
115      *
116      * @param input
117      * @param siteKey
118      * @param tidyConfig
119      * @param DOMVisitors
120      * @return
121      */

122     public static String JavaDoc parse( String JavaDoc input,
123                                 int siteId,
124                                 Properties tidyConfig,
125                                 Vector DOMVisitors){
126
127         if (input == null || "".equals(input.trim())) {
128             return input;
129         }
130         String JavaDoc result = new String JavaDoc(input);
131         result = JahiaTools.replacePattern(result, "&", AMPERSAND);
132
133         ByteArrayInputStream strIn;
134         ByteArrayOutputStream strOut;
135         BufferedInputStream urlIn;
136         Tidy tidy = new Tidy();
137
138         Properties config = (Properties) tidyConfig.clone();
139         String JavaDoc val = tidyConfig.getProperty(TidyConfig.NEW_BLOCK_LEVEL_TAGS);
140         if (val == null) {
141             val = "";
142         }
143         String JavaDoc tag = null;
144         int size = newBlockLevelTags.size();
145         for (int i = 0; i < size; i++) {
146             tag = (String JavaDoc) newBlockLevelTags.get(i);
147             if (val.length() == 0) {
148                 val = tag;
149             } else {
150                 val += "," + tag;
151             }
152         }
153         config.setProperty(TidyConfig.NEW_BLOCK_LEVEL_TAGS, val);
154
155         val = config.getProperty(TidyConfig.NEW_INLINE_TAGS);
156         if (val == null) {
157             val = "";
158         }
159         size = newInlineTags.size();
160         for (int i = 0; i < size; i++) {
161             tag = (String JavaDoc) newInlineTags.get(i);
162             if (val.length() == 0) {
163                 val = tag;
164             } else {
165                 val += "," + tag;
166             }
167         }
168
169         config.setProperty(TidyConfig.NEW_INLINE_TAGS, val);
170
171         // charset
172
byte[] strByte = null;
173         String JavaDoc charSet = null; // by default open as ascii
174
CharsetDetection charsetDet = new CharsetDetection();
175         try {
176             strByte = org.apache.commons.io.IOUtils.toByteArray(result);
177             strIn = new ByteArrayInputStream(strByte);
178             charsetDet.charsetDetection(strIn);
179             charSet = charsetDet.getCharset();
180
181             if ((config.getProperty(TidyConfig.CHAR_ENCODING) == null)
182                 && "UTF-8".equalsIgnoreCase(charSet)) {
183                 config.setProperty(TidyConfig.CHAR_ENCODING, "utf8");
184             }
185         } catch (Throwable JavaDoc t) {
186         }
187
188         tidy.setConfigurationFromProps(config);
189
190         try {
191             if (charSet == null) {
192                 strByte = result.getBytes();
193             } else {
194                 strByte = result.getBytes(charSet);
195             }
196             strIn = new ByteArrayInputStream(strByte);
197             strOut = new ByteArrayOutputStream();
198             ByteArrayOutputStream strErr = new ByteArrayOutputStream();
199             tidy.setErrout(new PrintWriter(strErr, true));
200             tidy.setShowWarnings(false);
201             tidy.parse(strIn, strOut);
202             strIn.reset();
203             String JavaDoc tmpValue = null;
204             if (charSet == null) {
205                 tmpValue = strOut.toString();
206             } else {
207                 tmpValue = strOut.toString(charSet);
208             }
209             tmpValue = JahiaTools.replacePattern(tmpValue, "&",
210                                                  AMPERSAND_SECONDPASS);
211
212             if (tmpValue == null) {
213                 tmpValue = "";
214             }
215             if (!"".equals(tmpValue.trim())) {
216                 if (charSet == null) {
217                     strByte = tmpValue.getBytes();
218                 } else {
219                     strByte = tmpValue.getBytes(charSet);
220                 }
221                 strIn = new ByteArrayInputStream(strByte);
222                 /*
223                 DOMParser domParser = new DOMParser();
224                 strIn = new ByteArrayInputStream(strByte);
225                 org.xml.sax.InputSource in = new org.xml.sax.InputSource(strIn);
226                 domParser.parse(in);
227                 Document doc = domParser.getDocument();
228                 DocumentBuilderFactory dfactory = DocumentBuilderFactory.
229                                                   newInstance();
230                 */

231                DocumentBuilderFactory dfactory = DocumentBuilderFactory.
232                                                  newInstance();
233
234                EntityResolver et = null;
235                try {
236                    et = ServicesRegistry.getInstance().
237                         getJahiaWebAppsDeployerService().getDtdEntityResolver();
238                } catch (Throwable JavaDoc t) {
239                }
240                DocumentBuilder docBuilder = dfactory.newDocumentBuilder();
241                if (et != null) {
242                    docBuilder.setEntityResolver(et);
243                }
244                Document doc = docBuilder.parse(strIn);
245
246                 TagRemover tagRemover =
247                         new TagRemover();
248
249                 synchronized (unrecognizedTags) {
250                     size = unrecognizedTags.size();
251                     for (int i = 0; i < size; i++) {
252                         tagRemover.addTag((String JavaDoc) unrecognizedTags.get(i));
253                     }
254                 }
255                 //tagRemover.addTag("o:p");
256
doc = tagRemover.parseDOM(doc);
257
258                 size = DOMVisitors.size();
259                 for (int i = 0; i < size; i++) {
260                     HtmlDOMVisitor visitor = (HtmlDOMVisitor) DOMVisitors.get(i);
261                     doc = visitor.parseDOM(doc);
262                 }
263
264                 doc.normalize();
265
266                 TransformerFactory tfactory = TransformerFactory.newInstance();
267
268                 // This creates a transformer that does a simple identity transform,
269
// and thus can be used for all intents and purposes as a serializer.
270
Transformer serializer = tfactory.newTransformer();
271
272                 serializer.setOutputProperty(OutputKeys.METHOD, "html");
273                 serializer.setOutputProperty(OutputKeys.INDENT, "yes");
274                 if (charSet != null) {
275                     serializer.setOutputProperty(OutputKeys.ENCODING, charSet);
276                 }
277                 //serializer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "4");
278
//serializer.setOutputProperty(OutputProperties.S_KEY_INDENT_AMOUNT, "4");
279
strOut.reset();
280                 serializer.transform(new DOMSource(doc),
281                                      new StreamResult(strOut));
282
283                 if (tidy.getParseErrors() > 0) {
284                     result = "<TIDYERRORS>\n" + strErr.toString() +
285                              "</TIDYERRORS>";
286                 } else {
287                     if (charSet == null) {
288                         result = strOut.toString();
289                     } else {
290                         result = strOut.toString(charSet);
291                     }
292                 }
293                 result = JahiaTools.replacePattern(result, AMPERSAND_SECONDPASS,
294                                                    "&");
295                 result = JahiaTools.text2XMLEntityRef(result, 1);
296                 result = JahiaTools.replacePattern(result, AMPERSAND, "&");
297
298             } else if (tidy.getParseErrors() > 0) {
299                 String JavaDoc err = strErr.toString();
300                 result = "<TIDYERRORS>\n" + err + "</TIDYERRORS>";
301                 if (err.indexOf("is not recognized!") != -1) {
302                     err = JahiaTools.replacePatternIgnoreCase(err.toLowerCase(),
303                             " - error: ", "@@@");
304                     String JavaDoc[] errors = org.jahia.utils.JahiaTools.getTokens(
305                             err, "@@@");
306                     if (errors.length > 0) {
307                         String JavaDoc token = "";
308                         ArrayList tags = new ArrayList();
309                         tag = null;
310                         String JavaDoc newInput = input;
311                         int pos = -1;
312                         for (int i = 0; i < errors.length; i++) {
313                             token = errors[i];
314                             pos = token.indexOf(" is not recognized!");
315                             if (pos != -1) {
316                                 try {
317                                     tag = token.substring(0, pos);
318                                     if (!tag.startsWith("<")) {
319                                         // we found an unknown empty tag
320
synchronized (unrecognizedTags) {
321                                             if (unrecognizedTags.contains(tag)) {
322                                                 continue;
323                                             } else {
324                                                 unrecognizedTags.add(tag);
325                                                 newInlineTags.add(tag);
326                                             }
327                                         }
328                                     } else {
329                                         tag = tag.substring(1, tag.length() - 1);
330                                         synchronized (unrecognizedTags) {
331                                             if (unrecognizedTags.contains(tag)) {
332                                                 continue;
333                                             } else {
334                                                 unrecognizedTags.add(tag);
335                                                 newBlockLevelTags.add(tag);
336                                             }
337                                         }
338                                     }
339                                 } catch (Throwable JavaDoc t) {
340                                 }
341                             }
342                         }
343                         result = parse(input, siteId, tidyConfig,
344                                        DOMVisitors);
345                     }
346                 }
347             }
348         } catch (Exception JavaDoc e) {
349             e.printStackTrace();
350             return input;
351         }
352         return result;
353    }
354
355 }
356
Popular Tags