KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > opencms > util > CmsHtmlParser


1 /*
2  * File : $Source: /usr/local/cvs/opencms/src/org/opencms/util/CmsHtmlParser.java,v $
3  * Date : $Date: 2006/09/19 14:29:08 $
4  * Version: $Revision: 1.3 $
5  *
6  * This library is part of OpenCms -
7  * the Open Source Content Management System
8  *
9  * Copyright (c) 2005 Alkacon Software GmbH (http://www.alkacon.com)
10  *
11  * This library is free software; you can redistribute it and/or
12  * modify it under the terms of the GNU Lesser General Public
13  * License as published by the Free Software Foundation; either
14  * version 2.1 of the License, or (at your option) any later version.
15  *
16  * This library is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19  * Lesser General Public License for more details.
20  *
21  * For further information about Alkacon Software GmbH, please see the
22  * company website: http://www.alkacon.com
23  *
24  * For further information about OpenCms, please see the
25  * project website: http://www.opencms.org
26  *
27  * You should have received a copy of the GNU Lesser General Public
28  * License along with this library; if not, write to the Free Software
29  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
30  */

31
32 package org.opencms.util;
33
import org.opencms.jsp.parse.DivTag;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;

import org.htmlparser.Parser;
import org.htmlparser.PrototypicalNodeFactory;
import org.htmlparser.Remark;
import org.htmlparser.Tag;
import org.htmlparser.Text;
import org.htmlparser.lexer.Lexer;
import org.htmlparser.lexer.Page;
import org.htmlparser.tags.Div;
import org.htmlparser.util.ParserException;
import org.htmlparser.visitors.NodeVisitor;
51
52 /**
53  * Base utility class for OpenCms <code>{@link org.htmlparser.visitors.NodeVisitor}</code>
54  * implementations, which provides some often used utility functions.
55  * <p>
56  *
57  * This base implementation is only a "pass through" class, that is the content is parsed, but the
58  * generated result is exactly identical to the input.
59  * <p>
60  *
61  * @author Alexander Kandzior
62  *
63  * @version $Revision: 1.3 $
64  *
65  * @since 6.2.0
66  */

67 public class CmsHtmlParser extends NodeVisitor implements I_CmsHtmlNodeVisitor {
68
69     /** List of upper case tag name strings of tags that should not be auto-corrected if closing divs are missing. */
70     private List JavaDoc m_noAutoCloseTags;
71
72     /** The array of supported tag names. */
73     // important: don't change the order of these tags in the source, subclasses may expect the tags
74
// at the exact indices give here
75
// if you want to add tags, add them at the end
76
protected static final String JavaDoc[] TAG_ARRAY = new String JavaDoc[] {
77         "H1",
78         "H2",
79         "H3",
80         "H4",
81         "H5",
82         "H6",
83         "P",
84         "DIV",
85         "SPAN",
86         "BR",
87         "OL",
88         "UL",
89         "LI",
90         "TABLE",
91         "TD",
92         "TR",
93         "TH",
94         "THEAD",
95         "TBODY",
96         "TFOOT"};
97
98     /** The list of supported tag names. */
99     protected static final List JavaDoc TAG_LIST = Arrays.asList(TAG_ARRAY);
100
101     /** Indicates if "echo" mode is on, that is all content is written to the result by default. */
102     protected boolean m_echo;
103
104     /** The buffer to write the out to. */
105     protected StringBuffer JavaDoc m_result;
106
107     /** The providable configuration - never null by contract of interface. */
108     private String JavaDoc m_configuration = "";
109
110     /**
111      * Creates a new instance of the html converter with echo mode set to <code>false</code>.
112      * <p>
113      */

114     public CmsHtmlParser() {
115
116         this(false);
117     }
118
119     /**
120      * Creates a new instance of the html converter.
121      * <p>
122      *
123      * @param echo indicates if "echo" mode is on, that is all content is written to the result
124      */

125     public CmsHtmlParser(boolean echo) {
126
127         m_result = new StringBuffer JavaDoc(1024);
128         m_echo = echo;
129         m_noAutoCloseTags = new ArrayList JavaDoc(32);
130     }
131
132
133     /**
134      * Degrades Composite tags that do have children in the DOM tree
135      * to simple single tags. This allows to avoid auto correction of unclosed HTML tags.<p>
136      *
137      * @return A node factory that will not autocorrect open tags specified via <code>{@link #setNoAutoCloseTags(List)}</code>
138      */

139     private PrototypicalNodeFactory configureNoAutoCorrectionTags() {
140
141         PrototypicalNodeFactory factory = new PrototypicalNodeFactory();
142
143         String JavaDoc tagName;
144         Iterator JavaDoc it = m_noAutoCloseTags.iterator();
145         Div div = new Div();
146         List JavaDoc divNames = Arrays.asList(div.getIds());
147         while (it.hasNext()) {
148             tagName = ((String JavaDoc)it.next());
149             // div
150
if (divNames.contains(tagName)) {
151                 factory.unregisterTag(new Div());
152                 factory.registerTag(new DivTag());
153             }
154             // TODO: add more tags for flat parsing / non correction of missing closing tags here
155
}
156         return factory;
157     }
158
159     /**
160      * @see org.opencms.util.I_CmsHtmlNodeVisitor#getConfiguration()
161      */

162     public String JavaDoc getConfiguration() {
163
164         return m_configuration;
165     }
166
167     /**
168      * @see org.opencms.util.I_CmsHtmlNodeVisitor#getResult()
169      */

170     public String JavaDoc getResult() {
171
172         return m_result.toString();
173     }
174
175     /**
176      * Returns the HTML for the given tag itself (not the tag content).
177      * <p>
178      *
179      * @param tag the tag to create the HTML for
180      *
181      * @return the HTML for the given tag
182      */

183     public String JavaDoc getTagHtml(Tag tag) {
184
185         StringBuffer JavaDoc result = new StringBuffer JavaDoc(32);
186         result.append('<');
187         result.append(tag.getText());
188         result.append('>');
189         return result.toString();
190     }
191
192     /**
193      * @see org.opencms.util.I_CmsHtmlNodeVisitor#process(java.lang.String, java.lang.String)
194      */

195     public String JavaDoc process(String JavaDoc html, String JavaDoc encoding) throws ParserException {
196         m_result = new StringBuffer JavaDoc();
197         Parser parser = new Parser();
198         Lexer lexer = new Lexer();
199
200         // initialize the page with the given charset
201
Page page = new Page(html, encoding);
202         lexer.setPage(page);
203         parser.setLexer(lexer);
204
205         if (m_noAutoCloseTags != null && m_noAutoCloseTags.size() > 0) {
206             // Degrade Composite tags that do have children in the DOM tree
207
// to simple single tags: This allows to finish this tag with openend HTML tags without the effect
208
// that htmlparser will generate the closing tags.
209
PrototypicalNodeFactory factory = configureNoAutoCorrectionTags();
210             lexer.setNodeFactory(factory);
211         }
212
213         // process the page using the given visitor
214
parser.visitAllNodesWith(this);
215         // return the result
216
return getResult();
217     }
218
219     /**
220      *
221      * @see org.opencms.util.I_CmsHtmlNodeVisitor#setConfiguration(java.lang.String)
222      */

223     public void setConfiguration(String JavaDoc configuration) {
224
225         if (CmsStringUtil.isNotEmpty(configuration)) {
226             m_configuration = configuration;
227         }
228
229     }
230
231     /**
232      * @see org.opencms.util.I_CmsHtmlNodeVisitor#visitEndTag(org.htmlparser.Tag)
233      */

234     public void visitEndTag(Tag tag) {
235
236         if (m_echo) {
237             m_result.append(getTagHtml(tag));
238         }
239     }
240
241     /**
242      * @see org.opencms.util.I_CmsHtmlNodeVisitor#visitRemarkNode(org.htmlparser.Remark)
243      */

244     public void visitRemarkNode(Remark remark) {
245
246         if (m_echo) {
247             m_result.append(remark.toHtml());
248         }
249     }
250
251     /**
252      * @see org.opencms.util.I_CmsHtmlNodeVisitor#visitStringNode(org.htmlparser.Text)
253      */

254     public void visitStringNode(Text text) {
255
256         if (m_echo) {
257             m_result.append(text.getText());
258         }
259     }
260
261     /**
262      * @see org.opencms.util.I_CmsHtmlNodeVisitor#visitTag(org.htmlparser.Tag)
263      */

264     public void visitTag(Tag tag) {
265
266         if (m_echo) {
267             m_result.append(getTagHtml(tag));
268         }
269     }
270
271     /**
272      * Collapse HTML whitespace in the given String.
273      * <p>
274      *
275      * @param string the string to collapse
276      *
277      * @return the input String with all HTML whitespace collapsed
278      */

279     protected String JavaDoc collapse(String JavaDoc string) {
280
281         int len = string.length();
282         StringBuffer JavaDoc result = new StringBuffer JavaDoc(len);
283         int state = 0;
284         for (int i = 0; i < len; i++) {
285             char c = string.charAt(i);
286             switch (c) {
287                 // see HTML specification section 9.1 White space
288
// http://www.w3.org/TR/html4/struct/text.html#h-9.1
289
case '\u0020':
290                 case '\u0009':
291                 case '\u000C':
292                 case '\u200B':
293                 case '\r':
294                 case '\n':
295                     if (0 != state) {
296                         state = 1;
297                     }
298                     break;
299                 default:
300                     if (1 == state) {
301                         result.append(' ');
302                     }
303                     state = 2;
304                     result.append(c);
305             }
306         }
307         return result.toString();
308     }
309
310     /**
311      * Returns a list of upper case tag names for which parsing / visiting will not correct missing closing tags.<p>
312      *
313      *
314      * @return a List of upper case tag names for which parsing / visiting will not correct missing closing tags
315      */

316     public List JavaDoc getNoAutoCloseTags() {
317
318         return m_noAutoCloseTags;
319     }
320
321     /**
322      * Sets a list of upper case tag names for which parsing / visiting should not correct missing closing tags.<p>
323      *
324      * @param noAutoCloseTagList a list of upper case tag names for which parsing / visiting
325      * should not correct missing closing tags to set.
326      */

327     public void setNoAutoCloseTags(List JavaDoc noAutoCloseTagList) {
328
329         // ensuring upper case
330
m_noAutoCloseTags.clear();
331         if (noAutoCloseTagList != null) {
332             Iterator JavaDoc it = noAutoCloseTagList.iterator();
333             while (it.hasNext()) {
334                 m_noAutoCloseTags.add(((String JavaDoc)it.next()).toUpperCase());
335             }
336         }
337     }
338 }
Popular Tags