KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > jahia > services > htmlparser > NekoHtmlParser


1 package org.jahia.services.htmlparser;
2
3 import java.io.*;
4 import java.util.*;
5
6 import javax.xml.transform.*;
7 import javax.xml.transform.dom.*;
8 import javax.xml.transform.stream.*;
9
10 import org.apache.xalan.templates.*;
11 import org.cyberneko.html.parsers.*;
12 import org.jahia.utils.fileparsers.*;
13 import org.w3c.dom.*;
14 import org.jahia.utils.JahiaTools;
15
16 /**
17  *
18  * <p>Title: Html Parser default implementation based on Neko Html Parser</p>
19  * <p>Description: </p>
20  * <p>Copyright: Copyright (c) 2002</p>
21  * <p>Company: </p>
22  * @author Khue Nguyen
23  * @version 1.0
24  */

25 public class NekoHtmlParser implements HtmlParser {
26
27     public static String JavaDoc AMPERSAND = "$$$amp$$$";
28
29     private static org.apache.log4j.Logger logger =
30                 org.apache.log4j.Logger.getLogger(NekoHtmlParser.class);
31
32     public NekoHtmlParser(){}
33
34     /**
35      *
36      * @param htmlParserService HtmlParserService
37      */

38     public void init(HtmlParserService htmlParserService){
39
40     }
41
42     /**
43      * Parses and generates a clean html document, remove unwanted markups,..
44      * Using default settings
45      *
46      * @param inputString
47      * @param DOMVisitors
48      * @return
49      */

50     public String JavaDoc parse(String JavaDoc inputString, Vector DOMVisitors){
51         return parse(inputString,-1,DOMVisitors);
52     }
53
54     /**
55      * Parses and generates a clean html document, remove unwanted markups,..
56      * Using settings as defined for a given site
57      *
58      * @param inputString
59      * @param DOMVisitors
60      * @param siteKey
61      * @return
62      */

63     public String JavaDoc parse(String JavaDoc inputString, Vector DOMVisitors,
64                         int siteId){
65         if ( inputString == null || inputString.trim().equals("") ){
66             return inputString;
67         }
68         return parse(inputString,siteId,DOMVisitors);
69     }
70
71     /**
72      * Parses and generates a clean html document, remove unwanted markups,..
73      * Using settings as defined for a given site
74      *
75      * @param input
76      * @param siteKey
77      * @param DOMVisitors
78      * @return
79      */

80     public static String JavaDoc parse( String JavaDoc input,
81                                 int siteId,
82                                 Vector DOMVisitors){
83
84         if ( input == null || "".equals(input.trim())){
85             return input;
86         }
87
88         String JavaDoc result = new String JavaDoc(input);
89         result = JahiaTools.replacePattern(result, "&", AMPERSAND);
90
91         ByteArrayInputStream strIn;
92         ByteArrayOutputStream strOut = new ByteArrayOutputStream();
93         byte[] strByte = null;
94         String JavaDoc charSet = null; // by default open as ascii
95
CharsetDetection charsetDet = new CharsetDetection();
96         try {
97             strByte = org.apache.commons.io.IOUtils.toByteArray(result);
98             strIn = new ByteArrayInputStream(strByte);
99             charsetDet.charsetDetection(strIn);
100             charSet = charsetDet.getCharset();
101         } catch ( Throwable JavaDoc t ){
102         }
103
104         DOMParser domParser = new DOMParser();
105         Document doc;
106         int size = 0;
107         try {
108             if ( charSet == null ){
109                 strByte = result.getBytes();
110             } else {
111                 strByte = result.getBytes(charSet);
112             }
113             strIn = new ByteArrayInputStream(strByte);
114             org.xml.sax.InputSource JavaDoc in = new org.xml.sax.InputSource JavaDoc(strIn);
115             domParser.setProperty("http://cyberneko.org/html/properties/default-encoding", charSet);
116             domParser.setProperty("http://cyberneko.org/html/properties/names/elems", "lower");
117             domParser.setProperty("http://cyberneko.org/html/properties/names/attrs", "lower");
118             domParser.parse(in);
119             doc = domParser.getDocument();
120
121             size = DOMVisitors.size();
122             for (int i = 0; i < size; i++) {
123                 HtmlDOMVisitor visitor = (HtmlDOMVisitor) DOMVisitors.get(i);
124                 doc = visitor.parseDOM(doc);
125             }
126
127             doc.normalize();
128             TransformerFactory tfactory = TransformerFactory.newInstance();
129
130             // This creates a transformer that does a simple identity transform,
131
// and thus can be used for all intents and purposes as a serializer.
132
Transformer serializer = tfactory.newTransformer();
133
134             serializer.setOutputProperty(OutputKeys.METHOD, "html");
135             serializer.setOutputProperty(OutputKeys.INDENT, "yes");
136             if ( charSet != null ){
137                 serializer.setOutputProperty(OutputKeys.ENCODING, charSet);
138             }
139             //serializer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "4");
140
serializer.setOutputProperty(OutputProperties.S_KEY_INDENT_AMOUNT, "2");
141             serializer.transform (new DOMSource(doc),
142                                  new StreamResult(strOut));
143             if ( charSet == null ){
144                 result = strOut.toString();
145             } else {
146                 result = strOut.toString(charSet);
147             }
148
149             result = JahiaTools.text2XMLEntityRef(result, 1);
150             result = JahiaTools.replacePattern(result, AMPERSAND, "&");
151
152         } catch ( Throwable JavaDoc t ){
153             logger.debug(t);
154             return input;
155         }
156         return result;
157    }
158 }
159
Popular Tags