TidyHTMLParser


1   /*
2    * Enhydra Java Application Server Project
3    * 
4    * The contents of this file are subject to the Enhydra Public License
5    * Version 1.1 (the "License"); you may not use this file except in
6    * compliance with the License. You may obtain a copy of the License on
7    * the Enhydra web site ( http://www.enhydra.org/ ).
8    * 
9    * Software distributed under the License is distributed on an "AS IS"
10   * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See 
11   * the License for the specific terms governing rights and limitations
12   * under the License.
13   * 
14   * The Initial Developer of the Enhydra Application Server is Lutris
15   * Technologies, Inc. The Enhydra Application Server and portions created
16   * by Lutris Technologies, Inc. are Copyright Lutris Technologies, Inc.
17   * All Rights Reserved.
18   * 
19   * Contributor(s):
20   * 
21   * $Id: TidyHTMLParser.java,v 1.3 2005/01/26 08:29:24 jkjome Exp $
22   */
23  
24  package org.enhydra.xml.xmlc.html.parsers.tidy;
25  
26  import java.io.ByteArrayInputStream  ;
27  import java.io.ByteArrayOutputStream  ;
28  import java.io.IOException  ;
29  import java.io.InputStream  ;
30  import java.io.OutputStreamWriter  ;
31  import java.io.Reader  ;
32  
33  import org.enhydra.xml.io.ErrorReporter;
34  import org.enhydra.xml.io.InputSourceOps;
35  import org.enhydra.xml.xmlc.XMLCError;
36  import org.enhydra.xml.xmlc.XMLCException;
37  import org.enhydra.xml.xmlc.dom.XMLCDocument;
38  import org.enhydra.xml.xmlc.dom.XMLCDomFactory;
39  import org.enhydra.xml.xmlc.html.parsers.HTMLDocBuilder;
40  import org.enhydra.xml.xmlc.html.parsers.HTMLParserBase;
41  import org.enhydra.xml.xmlc.metadata.HTMLAttrDef;
42  import org.enhydra.xml.xmlc.metadata.HTMLSection;
43  import org.enhydra.xml.xmlc.metadata.HTMLTagDef;
44  import org.enhydra.xml.xmlc.metadata.MetaData;
45  import org.enhydra.xml.xmlc.metadata.ParserType;
46  import org.enhydra.xml.xmlc.misc.LineNumberMap;
47  import org.enhydra.xml.xmlc.parsers.ParseTracer;
48  import org.enhydra.xml.xmlc.parsers.XMLCParser;
49  import org.w3c.tidy.AttVal;
50  import org.w3c.tidy.Attribute;
51  import org.w3c.tidy.AttributeTable;
52  import org.w3c.tidy.Configuration;
53  import org.w3c.tidy.Dict;
54  import org.w3c.tidy.Parser;
55  import org.w3c.tidy.ParserImpl;
56  import org.w3c.tidy.TagTable;
57  import org.w3c.tidy.Tidy;
58  import org.xml.sax.InputSource  ;
59  
60  /**
61   * XMLCParser object for HTML and HTML framesets that uses the Java version of
62   * the W3C HTML tidy program.  It uses Tidy to convert HTML to XHTML and then
63   * parses it with an XML parser.
64   */
65  public class TidyHTMLParser extends HTMLParserBase implements XMLCParser {
66      /**
67       * Verbose tracing.
68       */
69      private ParseTracer fTracer;
70  
71      /**
72       * The document builder.
73       */
74      private HTMLDocBuilder fDocBuilder;
75  
76      /**
77       * Tidy parser.
78       */
79      private Tidy fTidy = new Tidy();
80  
81      /**
82       * Constructor.
83       */
84      public TidyHTMLParser() throws XMLCException {
85          fTidy.setTidyMark(false); // Don't add Tidy META tag.
86          fTidy.setQuiet(true);  // Doc print guess HTML version
87      }
88  
89      /**
90       * Get the type of a Tidy node as a string.
91       */
92      private String   nodeTypeStr(short typeId) {
93          switch (typeId) {
94          case org.w3c.tidy.Node.RootNode: return "RootNode";
95          case org.w3c.tidy.Node.DocTypeTag: return "DocTypeTag";
96          case org.w3c.tidy.Node.CommentTag: return "CommentTag";
97          case org.w3c.tidy.Node.ProcInsTag: return "ProcInsTag";
98          case org.w3c.tidy.Node.TextNode: return "TextNode";
99          case org.w3c.tidy.Node.StartTag: return "StartTag";
100         case org.w3c.tidy.Node.EndTag: return "EndTag";
101         case org.w3c.tidy.Node.StartEndTag: return "StartEndTag";
102         case org.w3c.tidy.Node.AspTag: return "AspTag";
103         default: return "**Unknown Node**";
104         }
105     }
106 
107     /**
108      * Print information about a Tidy node.
109      */
110     private void printNodeInfo(org.w3c.tidy.Node tNode) {
111         StringBuffer   buf = new StringBuffer  ();
112         buf.append(nodeTypeStr(tNode.getType()) + ":");
113         if (tNode.getElement() != null) {
114             buf.append(" " + tNode.getElement());
115         }
116         // Print attributes
117         AttVal attVal = tNode.getAttributes();
118         while (attVal != null) {
119             buf.append("  " + attVal.attribute + "=\"" + attVal.value + "\"");
120             attVal = attVal.next;
121         }
122 
123         // Print associated text
124         if (tNode.getType() != org.w3c.tidy.Node.StartTag) {
125             String   value = tNode.getNodeValue();
126             if (value != null) {
127                 buf.append(" '" + value + "'");
128             }
129         }
130         fTracer.trace(buf.toString());
131     }
132 
133     /**
134      * Recursively print information about a Tidy node.
135      */
136     private void printNode(org.w3c.tidy.Node tNode) {
137         printNodeInfo(tNode);
138         org.w3c.tidy.Node next = tNode.getContent();
139         fTracer.enter();
140         while (next != null) {
141             printNode(next);
142             next = next.getNext();
143         }
144         fTracer.leave();
145     }
146 
147     /**
148      * Create an Element node from a Tidy node.
149      */
150     private void createElement(org.w3c.tidy.Node tNode) {
151         if (tNode.getElement() == null) {
152             return;
153         }
154         fDocBuilder.startElement(tNode.getElement());
155         
156         AttVal attVal = tNode.getAttributes();
157         while (attVal != null) {
158             String   value = (attVal.value == null) ? "" : attVal.value;
159             fDocBuilder.addAttribute(attVal.attribute, value);
160             attVal = attVal.next;
161         }
162     }
163 
164     /**
165      * Create a DOM node from a Tidy node.
166      * 
167      * @param tNode Tidy node.
168      */
169     private void makeDomNode(org.w3c.tidy.Node tNode) {
170         switch (tNode.getType()) {
171         case org.w3c.tidy.Node.CommentTag:
172             if (tNode.getNodeValue() != null) {
173                 fDocBuilder.addComment(tNode.getNodeValue());
174             }
175             break;
176         case org.w3c.tidy.Node.TextNode:
177             if (tNode.getNodeValue() != null) {
178                 fDocBuilder.addTextNode(tNode.getNodeValue());
179             }
180             break;
181         case org.w3c.tidy.Node.StartTag:
182         case org.w3c.tidy.Node.StartEndTag:;
183             createElement(tNode);
184             break;
185         case org.w3c.tidy.Node.EndTag:
186             new XMLCError("Internal error: Unexpected Tidy EndTag node");
187             break;
188         case org.w3c.tidy.Node.RootNode:
189         case org.w3c.tidy.Node.DocTypeTag:
190         case org.w3c.tidy.Node.ProcInsTag:
191         case org.w3c.tidy.Node.AspTag:
192             break;
193         default: 
194             throw new XMLCError("Internal error: Unknown node");
195         }
196     }
197 
198     /**
199      * Recursively convert a node of the Tidy tree to a DOM tree.
200      */
201     private void buildNode(org.w3c.tidy.Node tNode) {
202 
203         makeDomNode(tNode);
204 
205         // Recurse to process children.
206         org.w3c.tidy.Node tNext = tNode.getContent();
207         while (tNext != null) {
208             buildNode(tNext);
209             tNext = tNext.getNext();
210         }
211         if (tNode.getType() == org.w3c.tidy.Node.StartTag) {
212             fDocBuilder.finishElement();
213         }
214     }
215 
216     /**
217      * Check for a Tidy-supported encoding.  If one is found,
218      * set it as the encoding.
219      * @param htmlEncoding Encoding to check, null is assumed to
220      *  be ASCII.
221      * @return true if encoding is Tidy-supported, false if its
222      *  not.
223      */
224     private boolean checkForTidyEncoding(String   htmlEncoding) {
225         if ((htmlEncoding == null)
226             || (htmlEncoding.equalsIgnoreCase("US-ASCII"))
227             || (htmlEncoding.equalsIgnoreCase("ASCII"))) {
228             fTidy.setCharEncoding(Configuration.ASCII);
229             return true;
230         } else if (htmlEncoding.equalsIgnoreCase("ISO-8859-1")) {
231             fTidy.setCharEncoding(Configuration.LATIN1);
232             return true;
233         } else if (htmlEncoding.equalsIgnoreCase("UTF-8")) {
234             fTidy.setCharEncoding(Configuration.UTF8);
235             return true;
236         } else if (htmlEncoding.equalsIgnoreCase("ISO-2022-JP")) {
237             fTidy.setCharEncoding(Configuration.ISO2022);
238             return true;
239         } else {
240             return false;
241         }
242     }
243 
244     /**
245      * Convert the input to a UTF-8 byte array and make it into
246      * an input stream.
247      */
248     private InputStream   makeUTF8InputStream(InputSource   input)
249         throws IOException   {
250         
251         // Read into a UTF byte array.
252         Reader   reader = InputSourceOps.open(input);
253         try {
254             ByteArrayOutputStream   utf8Bytes = new ByteArrayOutputStream  ();
255             OutputStreamWriter   writer = new OutputStreamWriter  (utf8Bytes, "UTF-8");
256             char buffer[] = new char[4096];
257             int readSize;
258             while ((readSize = reader.read(buffer)) >= 0) {
259                 writer.write(buffer, 0, readSize);
260             }
261             writer.flush();
262 
263             // Now we can read it with Tidy
264             fTidy.setCharEncoding(Configuration.UTF8);
265             return new ByteArrayInputStream  (utf8Bytes.toByteArray());
266             
267         } finally {
268             InputSourceOps.closeIfOpened(input, reader);
269         }
270     }
271 
272     /**
273      * Get the input stream.  If the specified encoding is not one supported
274      * by Jtidy or a character stream is open, the file is read into a byte
275      * array of UTF characters and an input stream wrapped around that array.
276      * FIXME: If JTidy work on character streams, this mess could all go
277      * away.
278      */
279     private InputStream   getInputStream(InputSource   input) throws IOException   {
280         String   htmlEncoding = input.getEncoding();
281         if (!checkForTidyEncoding(htmlEncoding)) {
282             // Encoding not supported by tidy.
283             return makeUTF8InputStream(input);
284         }
285         if (input.getByteStream() != null) {
286             // Byte stream already open, use it.
287             return input.getByteStream();
288         }
289         if (input.getCharacterStream() != null) {
290             // Need to convert back to a byte stream.
291             return makeUTF8InputStream(input);
292         }
293         // Open it ourselves.
294         return InputSourceOps.openSystemId(input.getSystemId());
295     }
296 
297     /**
298      * Specify the list of proprietary tags and attributes.
299      */
300     private void setProprietaryTags(HTMLSection htmlSection) throws XMLCException {
301         //FIXME: Ouch, this is global to tidy; problem if invoking multiple times.
302 
303         HTMLTagDef[] tagDefs = htmlSection.getHTMLTagDefs();
304         TagTable tagTable = fTidy.getConfiguration().getTagTable();
305         for (int idx = 0; idx < tagDefs.length; idx++) {
306             addTag(tagTable, tagDefs[idx]);
307         }
308     
309         HTMLAttrDef[] attrDefs = htmlSection.getHTMLAttrDefs();
310         AttributeTable attributeTable = AttributeTable.getDefaultAttributeTable();
311         for (int idx = 0; idx < attrDefs.length; idx++) {
312             attributeTable.install(new Attribute(attrDefs[idx].getName().toLowerCase(),
313                                                  Dict.VERS_PROPRIETARY,
314                                                  null));
315         }
316     }
317 
318     /**
319      * @see XMLCParser#parse
320      */
321     public XMLCDocument parse(InputSource   input,
322                               LineNumberMap lineNumberMap,
323                               XMLCDomFactory domFactory,
324                               MetaData metaData,
325                               ErrorReporter errorReporter,
326                               ParseTracer tracer)
327         throws IOException  , XMLCException {
328 
329         validateConf(ParserType.TIDY, metaData);
330 
331         fTracer = tracer;
332         fDocBuilder = new HTMLDocBuilder(domFactory, input);
333 
334         setProprietaryTags(metaData.getHTMLSection());
335         fTidy.setInputStreamName(input.getSystemId());
336         fTidy.setErrout(new TidyErrorHandler(errorReporter,
337                                              input.getSystemId(),
338                                              lineNumberMap));
339 
340         InputStream   srcFileStream = getInputStream(input);
341         org.w3c.tidy.Node tRoot;
342         try {
343             tRoot = fTidy.parse(srcFileStream, null);
344         } finally {
345             // Close if was not open in InputSource
346             if (!InputSourceOps.isOpen(input)) {
347                 srcFileStream.close();
348             }
349         }
350         //FIXME: need to get error count from parser..
351         if (errorReporter.getErrorCnt() != 0) {
352             handleParseErrors(errorReporter);
353         }
354         
355         if ((fTracer != null) && fTracer.enabled()) {
356             printNode(tRoot);
357         }
358 
359         // Build DOM
360         buildNode(tRoot);
361         addPCDataContentElements(fDocBuilder.getXMLCDocument());
362         return fDocBuilder.getXMLCDocument();
363     }
364 
365     /**
366      * Add a proprietary tag to the set of allowed tags.
367      */
368     private void addTag(TagTable tagTable,
369                         HTMLTagDef tagDef) throws XMLCException {
370         Parser tagParser = null;
371         int model = 0;
372 
373         // Get basic content model, alow multiple to be combine,
374         // although this only makes sense with empty and the others.
375         if (tagDef.getEmpty()) {
376             model |= Dict.CM_EMPTY|Dict.CM_OPT;
377             tagParser = ParserImpl.getParseInline();
378         }
379         if (tagDef.getInline()) {
380             model |= Dict.CM_INLINE;
381             tagParser =  ParserImpl.getParseInline();
382         }
383         if (tagDef.getBlock()) {
384             model |= Dict.CM_BLOCK;
385             tagParser = ParserImpl.getParseBlock();
386         }
387         if (model == 0) {
388             throw new XMLCException("must specify at least one on TAG_CM_EMPTY, TAG_CM_INLINE, or TAG_CM_BLOCK");
389         }
390         
391         // Is close optional?
392         if (tagDef.getOptclose()) {
393             model |= Dict.CM_OPT;
394         }
395 
396         // Let them occur anywhere and contain anything
397         model |= Dict.CM_HEAD|Dict.CM_HTML|Dict.CM_MIXED;
398 
399         tagTable.install(new Dict(tagDef.getName().toLowerCase(),
400                                   Dict.VERS_PROPRIETARY,
401                                   model, tagParser, null));
402     }
403 }
404
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags