KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > outerj > daisy > htmlcleaner > NekoHtmlParser


1 /*
2  * Copyright 2004 Outerthought bvba and Schaubroeck nv
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */

16 package org.outerj.daisy.htmlcleaner;
17
18 import org.xml.sax.*;
19 import org.xml.sax.helpers.AttributesImpl JavaDoc;
20 import org.cyberneko.html.parsers.SAXParser;
21 import org.outerj.daisy.xmlutil.SaxBuffer;
22
23 import java.io.IOException JavaDoc;
24 import java.io.StringReader JavaDoc;
25
26 /**
27  * Parses HTML files using the Neko HTML parser. Puts all elements and attribute
28  * names to lowercase, removes all namespaces, produces well-formed XML.
29  */

30 class NekoHtmlParser {
31     public SaxBuffer parse(String JavaDoc html) throws IOException JavaDoc, SAXException {
32         if (html == null)
33             throw new NullPointerException JavaDoc("html string argument is required.");
34
35         InputSource is = new InputSource();
36         is.setCharacterStream(new StringReader JavaDoc(html));
37
38         SAXParser parser = new SAXParser();
39         parser.setFeature("http://xml.org/sax/features/namespaces", true);
40         parser.setFeature("http://cyberneko.org/html/features/override-namespaces", false);
41         parser.setFeature("http://cyberneko.org/html/features/insert-namespaces", false);
42         parser.setFeature("http://cyberneko.org/html/features/scanner/ignore-specified-charset", true);
43         parser.setProperty("http://cyberneko.org/html/properties/default-encoding", "UTF-8");
44         parser.setProperty("http://cyberneko.org/html/properties/names/elems", "lower");
45         parser.setProperty("http://cyberneko.org/html/properties/names/attrs", "lower");
46
47         SaxBuffer buffer = new SaxBuffer();
48         parser.setContentHandler(new RemoveNamespacesHandler(new MergeCharacterEventsHandler(buffer)));
49         parser.parse(is);
50
51         return buffer;
52     }
53
54     /**
55      * A ContentHandler that drops all namespace information.
56      */

57     static class RemoveNamespacesHandler implements ContentHandler {
58         private ContentHandler consumer;
59
60         public RemoveNamespacesHandler(ContentHandler consumer) {
61             this.consumer = consumer;
62         }
63
64         public void endDocument() throws SAXException {
65             consumer.endDocument();
66         }
67
68         public void startDocument() throws SAXException {
69             consumer.startDocument();
70         }
71
72         public void characters(char ch[], int start, int length) throws SAXException {
73             consumer.characters(ch, start, length);
74         }
75
76         public void ignorableWhitespace(char ch[], int start, int length) throws SAXException {
77             consumer.ignorableWhitespace(ch, start, length);
78         }
79
80         public void endPrefixMapping(String JavaDoc prefix) throws SAXException {
81             // dropped on purpose
82
}
83
84         public void skippedEntity(String JavaDoc name) throws SAXException {
85             // dropped on purpose
86
}
87
88         public void setDocumentLocator(Locator locator) {
89             consumer.setDocumentLocator(locator);
90         }
91
92         public void processingInstruction(String JavaDoc target, String JavaDoc data) throws SAXException {
93             // dropped on purpose
94
}
95
96         public void startPrefixMapping(String JavaDoc prefix, String JavaDoc uri) throws SAXException {
97             // dropped on purpose
98
}
99
100         public void endElement(String JavaDoc namespaceURI, String JavaDoc localName, String JavaDoc qName) throws SAXException {
101             consumer.endElement("", localName, localName);
102         }
103
104         public void startElement(String JavaDoc namespaceURI, String JavaDoc localName, String JavaDoc qName, Attributes atts) throws SAXException {
105             AttributesImpl JavaDoc newAtts = new AttributesImpl JavaDoc(atts);
106             for (int i = 0; i < atts.getLength(); i++) {
107                 newAtts.setURI(i, "");
108                 newAtts.setQName(i, newAtts.getLocalName(i));
109             }
110             consumer.startElement("", localName, localName, atts);
111         }
112     }
113
114 }
115
Popular Tags