KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > enhydra > xml > xmlc > html > parsers > tidy > TidyHTMLParser


1 /*
2  * Enhydra Java Application Server Project
3  *
4  * The contents of this file are subject to the Enhydra Public License
5  * Version 1.1 (the "License"); you may not use this file except in
6  * compliance with the License. You may obtain a copy of the License on
7  * the Enhydra web site ( http://www.enhydra.org/ ).
8  *
9  * Software distributed under the License is distributed on an "AS IS"
10  * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
11  * the License for the specific terms governing rights and limitations
12  * under the License.
13  *
14  * The Initial Developer of the Enhydra Application Server is Lutris
15  * Technologies, Inc. The Enhydra Application Server and portions created
16  * by Lutris Technologies, Inc. are Copyright Lutris Technologies, Inc.
17  * All Rights Reserved.
18  *
19  * Contributor(s):
20  *
21  * $Id: TidyHTMLParser.java,v 1.3 2005/01/26 08:29:24 jkjome Exp $
22  */

23
24 package org.enhydra.xml.xmlc.html.parsers.tidy;
25
26 import java.io.ByteArrayInputStream JavaDoc;
27 import java.io.ByteArrayOutputStream JavaDoc;
28 import java.io.IOException JavaDoc;
29 import java.io.InputStream JavaDoc;
30 import java.io.OutputStreamWriter JavaDoc;
31 import java.io.Reader JavaDoc;
32
33 import org.enhydra.xml.io.ErrorReporter;
34 import org.enhydra.xml.io.InputSourceOps;
35 import org.enhydra.xml.xmlc.XMLCError;
36 import org.enhydra.xml.xmlc.XMLCException;
37 import org.enhydra.xml.xmlc.dom.XMLCDocument;
38 import org.enhydra.xml.xmlc.dom.XMLCDomFactory;
39 import org.enhydra.xml.xmlc.html.parsers.HTMLDocBuilder;
40 import org.enhydra.xml.xmlc.html.parsers.HTMLParserBase;
41 import org.enhydra.xml.xmlc.metadata.HTMLAttrDef;
42 import org.enhydra.xml.xmlc.metadata.HTMLSection;
43 import org.enhydra.xml.xmlc.metadata.HTMLTagDef;
44 import org.enhydra.xml.xmlc.metadata.MetaData;
45 import org.enhydra.xml.xmlc.metadata.ParserType;
46 import org.enhydra.xml.xmlc.misc.LineNumberMap;
47 import org.enhydra.xml.xmlc.parsers.ParseTracer;
48 import org.enhydra.xml.xmlc.parsers.XMLCParser;
49 import org.w3c.tidy.AttVal;
50 import org.w3c.tidy.Attribute;
51 import org.w3c.tidy.AttributeTable;
52 import org.w3c.tidy.Configuration;
53 import org.w3c.tidy.Dict;
54 import org.w3c.tidy.Parser;
55 import org.w3c.tidy.ParserImpl;
56 import org.w3c.tidy.TagTable;
57 import org.w3c.tidy.Tidy;
58 import org.xml.sax.InputSource JavaDoc;
59
60 /**
61  * XMLCParser object for HTML and HTML framesets that uses the Java version of
62  * the W3C HTML tidy program. It uses Tidy to convert HTML to XHTML and then
63  * parses it with an XML parser.
64  */

65 public class TidyHTMLParser extends HTMLParserBase implements XMLCParser {
66     /**
67      * Verbose tracing.
68      */

69     private ParseTracer fTracer;
70
71     /**
72      * The document builder.
73      */

74     private HTMLDocBuilder fDocBuilder;
75
76     /**
77      * Tidy parser.
78      */

79     private Tidy fTidy = new Tidy();
80
81     /**
82      * Constructor.
83      */

84     public TidyHTMLParser() throws XMLCException {
85         fTidy.setTidyMark(false); // Don't add Tidy META tag.
86
fTidy.setQuiet(true); // Doc print guess HTML version
87
}
88
89     /**
90      * Get the type of a Tidy node as a string.
91      */

92     private String JavaDoc nodeTypeStr(short typeId) {
93         switch (typeId) {
94         case org.w3c.tidy.Node.RootNode: return "RootNode";
95         case org.w3c.tidy.Node.DocTypeTag: return "DocTypeTag";
96         case org.w3c.tidy.Node.CommentTag: return "CommentTag";
97         case org.w3c.tidy.Node.ProcInsTag: return "ProcInsTag";
98         case org.w3c.tidy.Node.TextNode: return "TextNode";
99         case org.w3c.tidy.Node.StartTag: return "StartTag";
100         case org.w3c.tidy.Node.EndTag: return "EndTag";
101         case org.w3c.tidy.Node.StartEndTag: return "StartEndTag";
102         case org.w3c.tidy.Node.AspTag: return "AspTag";
103         default: return "**Unknown Node**";
104         }
105     }
106
107     /**
108      * Print information about a Tidy node.
109      */

110     private void printNodeInfo(org.w3c.tidy.Node tNode) {
111         StringBuffer JavaDoc buf = new StringBuffer JavaDoc();
112         buf.append(nodeTypeStr(tNode.getType()) + ":");
113         if (tNode.getElement() != null) {
114             buf.append(" " + tNode.getElement());
115         }
116         // Print attributes
117
AttVal attVal = tNode.getAttributes();
118         while (attVal != null) {
119             buf.append(" " + attVal.attribute + "=\"" + attVal.value + "\"");
120             attVal = attVal.next;
121         }
122
123         // Print associated text
124
if (tNode.getType() != org.w3c.tidy.Node.StartTag) {
125             String JavaDoc value = tNode.getNodeValue();
126             if (value != null) {
127                 buf.append(" '" + value + "'");
128             }
129         }
130         fTracer.trace(buf.toString());
131     }
132
133     /**
134      * Recursively print information about a Tidy node.
135      */

136     private void printNode(org.w3c.tidy.Node tNode) {
137         printNodeInfo(tNode);
138         org.w3c.tidy.Node next = tNode.getContent();
139         fTracer.enter();
140         while (next != null) {
141             printNode(next);
142             next = next.getNext();
143         }
144         fTracer.leave();
145     }
146
147     /**
148      * Create an Element node from a Tidy node.
149      */

150     private void createElement(org.w3c.tidy.Node tNode) {
151         if (tNode.getElement() == null) {
152             return;
153         }
154         fDocBuilder.startElement(tNode.getElement());
155         
156         AttVal attVal = tNode.getAttributes();
157         while (attVal != null) {
158             String JavaDoc value = (attVal.value == null) ? "" : attVal.value;
159             fDocBuilder.addAttribute(attVal.attribute, value);
160             attVal = attVal.next;
161         }
162     }
163
164     /**
165      * Create a DOM node from a Tidy node.
166      *
167      * @param tNode Tidy node.
168      */

169     private void makeDomNode(org.w3c.tidy.Node tNode) {
170         switch (tNode.getType()) {
171         case org.w3c.tidy.Node.CommentTag:
172             if (tNode.getNodeValue() != null) {
173                 fDocBuilder.addComment(tNode.getNodeValue());
174             }
175             break;
176         case org.w3c.tidy.Node.TextNode:
177             if (tNode.getNodeValue() != null) {
178                 fDocBuilder.addTextNode(tNode.getNodeValue());
179             }
180             break;
181         case org.w3c.tidy.Node.StartTag:
182         case org.w3c.tidy.Node.StartEndTag:;
183             createElement(tNode);
184             break;
185         case org.w3c.tidy.Node.EndTag:
186             new XMLCError("Internal error: Unexpected Tidy EndTag node");
187             break;
188         case org.w3c.tidy.Node.RootNode:
189         case org.w3c.tidy.Node.DocTypeTag:
190         case org.w3c.tidy.Node.ProcInsTag:
191         case org.w3c.tidy.Node.AspTag:
192             break;
193         default:
194             throw new XMLCError("Internal error: Unknown node");
195         }
196     }
197
198     /**
199      * Recursively convert a node of the Tidy tree to a DOM tree.
200      */

201     private void buildNode(org.w3c.tidy.Node tNode) {
202
203         makeDomNode(tNode);
204
205         // Recurse to process children.
206
org.w3c.tidy.Node tNext = tNode.getContent();
207         while (tNext != null) {
208             buildNode(tNext);
209             tNext = tNext.getNext();
210         }
211         if (tNode.getType() == org.w3c.tidy.Node.StartTag) {
212             fDocBuilder.finishElement();
213         }
214     }
215
216     /**
217      * Check for a Tidy-supported encoding. If one is found,
218      * set it as the encoding.
219      * @param htmlEncoding Encoding to check, null is assumed to
220      * be ASCII.
221      * @return true if encoding is Tidy-supported, false if its
222      * not.
223      */

224     private boolean checkForTidyEncoding(String JavaDoc htmlEncoding) {
225         if ((htmlEncoding == null)
226             || (htmlEncoding.equalsIgnoreCase("US-ASCII"))
227             || (htmlEncoding.equalsIgnoreCase("ASCII"))) {
228             fTidy.setCharEncoding(Configuration.ASCII);
229             return true;
230         } else if (htmlEncoding.equalsIgnoreCase("ISO-8859-1")) {
231             fTidy.setCharEncoding(Configuration.LATIN1);
232             return true;
233         } else if (htmlEncoding.equalsIgnoreCase("UTF-8")) {
234             fTidy.setCharEncoding(Configuration.UTF8);
235             return true;
236         } else if (htmlEncoding.equalsIgnoreCase("ISO-2022-JP")) {
237             fTidy.setCharEncoding(Configuration.ISO2022);
238             return true;
239         } else {
240             return false;
241         }
242     }
243
244     /**
245      * Convert the input to a UTF-8 byte array and make it into
246      * an input stream.
247      */

248     private InputStream JavaDoc makeUTF8InputStream(InputSource JavaDoc input)
249         throws IOException JavaDoc {
250         
251         // Read into a UTF byte array.
252
Reader JavaDoc reader = InputSourceOps.open(input);
253         try {
254             ByteArrayOutputStream JavaDoc utf8Bytes = new ByteArrayOutputStream JavaDoc();
255             OutputStreamWriter JavaDoc writer = new OutputStreamWriter JavaDoc(utf8Bytes, "UTF-8");
256             char buffer[] = new char[4096];
257             int readSize;
258             while ((readSize = reader.read(buffer)) >= 0) {
259                 writer.write(buffer, 0, readSize);
260             }
261             writer.flush();
262
263             // Now we can read it with Tidy
264
fTidy.setCharEncoding(Configuration.UTF8);
265             return new ByteArrayInputStream JavaDoc(utf8Bytes.toByteArray());
266             
267         } finally {
268             InputSourceOps.closeIfOpened(input, reader);
269         }
270     }
271
272     /**
273      * Get the input stream. If the specified encoding is not one supported
274      * by Jtidy or a character stream is open, the file is read into a byte
275      * array of UTF characters and an input stream wrapped around that array.
276      * FIXME: If JTidy work on character streams, this mess could all go
277      * away.
278      */

279     private InputStream JavaDoc getInputStream(InputSource JavaDoc input) throws IOException JavaDoc {
280         String JavaDoc htmlEncoding = input.getEncoding();
281         if (!checkForTidyEncoding(htmlEncoding)) {
282             // Encoding not supported by tidy.
283
return makeUTF8InputStream(input);
284         }
285         if (input.getByteStream() != null) {
286             // Byte stream already open, use it.
287
return input.getByteStream();
288         }
289         if (input.getCharacterStream() != null) {
290             // Need to convert back to a byte stream.
291
return makeUTF8InputStream(input);
292         }
293         // Open it ourselves.
294
return InputSourceOps.openSystemId(input.getSystemId());
295     }
296
297     /**
298      * Specify the list of proprietary tags and attributes.
299      */

300     private void setProprietaryTags(HTMLSection htmlSection) throws XMLCException {
301         //FIXME: Ouch, this is global to tidy; problem if invoking multiple times.
302

303         HTMLTagDef[] tagDefs = htmlSection.getHTMLTagDefs();
304         TagTable tagTable = fTidy.getConfiguration().getTagTable();
305         for (int idx = 0; idx < tagDefs.length; idx++) {
306             addTag(tagTable, tagDefs[idx]);
307         }
308     
309         HTMLAttrDef[] attrDefs = htmlSection.getHTMLAttrDefs();
310         AttributeTable attributeTable = AttributeTable.getDefaultAttributeTable();
311         for (int idx = 0; idx < attrDefs.length; idx++) {
312             attributeTable.install(new Attribute(attrDefs[idx].getName().toLowerCase(),
313                                                  Dict.VERS_PROPRIETARY,
314                                                  null));
315         }
316     }
317
318     /**
319      * @see XMLCParser#parse
320      */

321     public XMLCDocument parse(InputSource JavaDoc input,
322                               LineNumberMap lineNumberMap,
323                               XMLCDomFactory domFactory,
324                               MetaData metaData,
325                               ErrorReporter errorReporter,
326                               ParseTracer tracer)
327         throws IOException JavaDoc, XMLCException {
328
329         validateConf(ParserType.TIDY, metaData);
330
331         fTracer = tracer;
332         fDocBuilder = new HTMLDocBuilder(domFactory, input);
333
334         setProprietaryTags(metaData.getHTMLSection());
335         fTidy.setInputStreamName(input.getSystemId());
336         fTidy.setErrout(new TidyErrorHandler(errorReporter,
337                                              input.getSystemId(),
338                                              lineNumberMap));
339
340         InputStream JavaDoc srcFileStream = getInputStream(input);
341         org.w3c.tidy.Node tRoot;
342         try {
343             tRoot = fTidy.parse(srcFileStream, null);
344         } finally {
345             // Close if was not open in InputSource
346
if (!InputSourceOps.isOpen(input)) {
347                 srcFileStream.close();
348             }
349         }
350         //FIXME: need to get error count from parser..
351
if (errorReporter.getErrorCnt() != 0) {
352             handleParseErrors(errorReporter);
353         }
354         
355         if ((fTracer != null) && fTracer.enabled()) {
356             printNode(tRoot);
357         }
358
359         // Build DOM
360
buildNode(tRoot);
361         addPCDataContentElements(fDocBuilder.getXMLCDocument());
362         return fDocBuilder.getXMLCDocument();
363     }
364
365     /**
366      * Add a proprietary tag to the set of allowed tags.
367      */

368     private void addTag(TagTable tagTable,
369                         HTMLTagDef tagDef) throws XMLCException {
370         Parser tagParser = null;
371         int model = 0;
372
373         // Get basic content model, alow multiple to be combine,
374
// although this only makes sense with empty and the others.
375
if (tagDef.getEmpty()) {
376             model |= Dict.CM_EMPTY|Dict.CM_OPT;
377             tagParser = ParserImpl.getParseInline();
378         }
379         if (tagDef.getInline()) {
380             model |= Dict.CM_INLINE;
381             tagParser = ParserImpl.getParseInline();
382         }
383         if (tagDef.getBlock()) {
384             model |= Dict.CM_BLOCK;
385             tagParser = ParserImpl.getParseBlock();
386         }
387         if (model == 0) {
388             throw new XMLCException("must specify at least one on TAG_CM_EMPTY, TAG_CM_INLINE, or TAG_CM_BLOCK");
389         }
390         
391         // Is close optional?
392
if (tagDef.getOptclose()) {
393             model |= Dict.CM_OPT;
394         }
395
396         // Let them occur anywhere and contain anything
397
model |= Dict.CM_HEAD|Dict.CM_HTML|Dict.CM_MIXED;
398
399         tagTable.install(new Dict(tagDef.getName().toLowerCase(),
400                                   Dict.VERS_PROPRIETARY,
401                                   model, tagParser, null));
402     }
403 }
404
Popular Tags