1 16 package org.outerj.daisy.htmlcleaner; 17 18 import org.xml.sax.*; 19 import org.xml.sax.helpers.AttributesImpl ; 20 import org.cyberneko.html.parsers.SAXParser; 21 import org.outerj.daisy.xmlutil.SaxBuffer; 22 23 import java.io.IOException ; 24 import java.io.StringReader ; 25 26 30 class NekoHtmlParser { 31 public SaxBuffer parse(String html) throws IOException , SAXException { 32 if (html == null) 33 throw new NullPointerException ("html string argument is required."); 34 35 InputSource is = new InputSource(); 36 is.setCharacterStream(new StringReader (html)); 37 38 SAXParser parser = new SAXParser(); 39 parser.setFeature("http://xml.org/sax/features/namespaces", true); 40 parser.setFeature("http://cyberneko.org/html/features/override-namespaces", false); 41 parser.setFeature("http://cyberneko.org/html/features/insert-namespaces", false); 42 parser.setFeature("http://cyberneko.org/html/features/scanner/ignore-specified-charset", true); 43 parser.setProperty("http://cyberneko.org/html/properties/default-encoding", "UTF-8"); 44 parser.setProperty("http://cyberneko.org/html/properties/names/elems", "lower"); 45 parser.setProperty("http://cyberneko.org/html/properties/names/attrs", "lower"); 46 47 SaxBuffer buffer = new SaxBuffer(); 48 parser.setContentHandler(new RemoveNamespacesHandler(new MergeCharacterEventsHandler(buffer))); 49 parser.parse(is); 50 51 return buffer; 52 } 53 54 57 static class RemoveNamespacesHandler implements ContentHandler { 58 private ContentHandler consumer; 59 60 public RemoveNamespacesHandler(ContentHandler consumer) { 61 this.consumer = consumer; 62 } 63 64 public void endDocument() throws SAXException { 65 consumer.endDocument(); 66 } 67 68 public void startDocument() throws SAXException { 69 consumer.startDocument(); 70 } 71 72 public void characters(char ch[], int start, int length) throws SAXException { 73 consumer.characters(ch, start, length); 74 } 75 76 public void ignorableWhitespace(char ch[], int start, int length) throws SAXException { 77 consumer.ignorableWhitespace(ch, start, length); 78 } 79 80 public void endPrefixMapping(String prefix) throws SAXException { 81 } 83 84 public void skippedEntity(String name) throws SAXException { 85 } 87 88 public void setDocumentLocator(Locator locator) { 89 consumer.setDocumentLocator(locator); 90 } 91 92 public void processingInstruction(String target, String data) throws SAXException { 93 } 95 96 public void startPrefixMapping(String prefix, String uri) throws SAXException { 97 } 99 100 public void endElement(String namespaceURI, String localName, String qName) throws SAXException { 101 consumer.endElement("", localName, localName); 102 } 103 104 public void startElement(String namespaceURI, String localName, String qName, Attributes atts) throws SAXException { 105 AttributesImpl newAtts = new AttributesImpl (atts); 106 for (int i = 0; i < atts.getLength(); i++) { 107 newAtts.setURI(i, ""); 108 newAtts.setQName(i, newAtts.getLocalName(i)); 109 } 110 consumer.startElement("", localName, localName, atts); 111 } 112 } 113 114 } 115 | Popular Tags |