1 23 24 package org.enhydra.xml.xmlc.html.parsers.tidy; 25 26 import java.io.ByteArrayInputStream ; 27 import java.io.ByteArrayOutputStream ; 28 import java.io.IOException ; 29 import java.io.InputStream ; 30 import java.io.OutputStreamWriter ; 31 import java.io.Reader ; 32 33 import org.enhydra.xml.io.ErrorReporter; 34 import org.enhydra.xml.io.InputSourceOps; 35 import org.enhydra.xml.xmlc.XMLCError; 36 import org.enhydra.xml.xmlc.XMLCException; 37 import org.enhydra.xml.xmlc.dom.XMLCDocument; 38 import org.enhydra.xml.xmlc.dom.XMLCDomFactory; 39 import org.enhydra.xml.xmlc.html.parsers.HTMLDocBuilder; 40 import org.enhydra.xml.xmlc.html.parsers.HTMLParserBase; 41 import org.enhydra.xml.xmlc.metadata.HTMLAttrDef; 42 import org.enhydra.xml.xmlc.metadata.HTMLSection; 43 import org.enhydra.xml.xmlc.metadata.HTMLTagDef; 44 import org.enhydra.xml.xmlc.metadata.MetaData; 45 import org.enhydra.xml.xmlc.metadata.ParserType; 46 import org.enhydra.xml.xmlc.misc.LineNumberMap; 47 import org.enhydra.xml.xmlc.parsers.ParseTracer; 48 import org.enhydra.xml.xmlc.parsers.XMLCParser; 49 import org.w3c.tidy.AttVal; 50 import org.w3c.tidy.Attribute; 51 import org.w3c.tidy.AttributeTable; 52 import org.w3c.tidy.Configuration; 53 import org.w3c.tidy.Dict; 54 import org.w3c.tidy.Parser; 55 import org.w3c.tidy.ParserImpl; 56 import org.w3c.tidy.TagTable; 57 import org.w3c.tidy.Tidy; 58 import org.xml.sax.InputSource ; 59 60 65 public class TidyHTMLParser extends HTMLParserBase implements XMLCParser { 66 69 private ParseTracer fTracer; 70 71 74 private HTMLDocBuilder fDocBuilder; 75 76 79 private Tidy fTidy = new Tidy(); 80 81 84 public TidyHTMLParser() throws XMLCException { 85 fTidy.setTidyMark(false); fTidy.setQuiet(true); } 88 89 92 private String nodeTypeStr(short typeId) { 93 switch (typeId) { 94 case org.w3c.tidy.Node.RootNode: return "RootNode"; 95 case org.w3c.tidy.Node.DocTypeTag: return "DocTypeTag"; 96 case org.w3c.tidy.Node.CommentTag: return "CommentTag"; 97 case org.w3c.tidy.Node.ProcInsTag: return "ProcInsTag"; 98 case org.w3c.tidy.Node.TextNode: return "TextNode"; 99 case org.w3c.tidy.Node.StartTag: return "StartTag"; 100 case org.w3c.tidy.Node.EndTag: return "EndTag"; 101 case org.w3c.tidy.Node.StartEndTag: return "StartEndTag"; 102 case org.w3c.tidy.Node.AspTag: return "AspTag"; 103 default: return "**Unknown Node**"; 104 } 105 } 106 107 110 private void printNodeInfo(org.w3c.tidy.Node tNode) { 111 StringBuffer buf = new StringBuffer (); 112 buf.append(nodeTypeStr(tNode.getType()) + ":"); 113 if (tNode.getElement() != null) { 114 buf.append(" " + tNode.getElement()); 115 } 116 AttVal attVal = tNode.getAttributes(); 118 while (attVal != null) { 119 buf.append(" " + attVal.attribute + "=\"" + attVal.value + "\""); 120 attVal = attVal.next; 121 } 122 123 if (tNode.getType() != org.w3c.tidy.Node.StartTag) { 125 String value = tNode.getNodeValue(); 126 if (value != null) { 127 buf.append(" '" + value + "'"); 128 } 129 } 130 fTracer.trace(buf.toString()); 131 } 132 133 136 private void printNode(org.w3c.tidy.Node tNode) { 137 printNodeInfo(tNode); 138 org.w3c.tidy.Node next = tNode.getContent(); 139 fTracer.enter(); 140 while (next != null) { 141 printNode(next); 142 next = next.getNext(); 143 } 144 fTracer.leave(); 145 } 146 147 150 private void createElement(org.w3c.tidy.Node tNode) { 151 if (tNode.getElement() == null) { 152 return; 153 } 154 fDocBuilder.startElement(tNode.getElement()); 155 156 AttVal attVal = tNode.getAttributes(); 157 while (attVal != null) { 158 String value = (attVal.value == null) ? "" : attVal.value; 159 fDocBuilder.addAttribute(attVal.attribute, value); 160 attVal = attVal.next; 161 } 162 } 163 164 169 private void makeDomNode(org.w3c.tidy.Node tNode) { 170 switch (tNode.getType()) { 171 case org.w3c.tidy.Node.CommentTag: 172 if (tNode.getNodeValue() != null) { 173 fDocBuilder.addComment(tNode.getNodeValue()); 174 } 175 break; 176 case org.w3c.tidy.Node.TextNode: 177 if (tNode.getNodeValue() != null) { 178 fDocBuilder.addTextNode(tNode.getNodeValue()); 179 } 180 break; 181 case org.w3c.tidy.Node.StartTag: 182 case org.w3c.tidy.Node.StartEndTag:; 183 createElement(tNode); 184 break; 185 case org.w3c.tidy.Node.EndTag: 186 new XMLCError("Internal error: Unexpected Tidy EndTag node"); 187 break; 188 case org.w3c.tidy.Node.RootNode: 189 case org.w3c.tidy.Node.DocTypeTag: 190 case org.w3c.tidy.Node.ProcInsTag: 191 case org.w3c.tidy.Node.AspTag: 192 break; 193 default: 194 throw new XMLCError("Internal error: Unknown node"); 195 } 196 } 197 198 201 private void buildNode(org.w3c.tidy.Node tNode) { 202 203 makeDomNode(tNode); 204 205 org.w3c.tidy.Node tNext = tNode.getContent(); 207 while (tNext != null) { 208 buildNode(tNext); 209 tNext = tNext.getNext(); 210 } 211 if (tNode.getType() == org.w3c.tidy.Node.StartTag) { 212 fDocBuilder.finishElement(); 213 } 214 } 215 216 224 private boolean checkForTidyEncoding(String htmlEncoding) { 225 if ((htmlEncoding == null) 226 || (htmlEncoding.equalsIgnoreCase("US-ASCII")) 227 || (htmlEncoding.equalsIgnoreCase("ASCII"))) { 228 fTidy.setCharEncoding(Configuration.ASCII); 229 return true; 230 } else if (htmlEncoding.equalsIgnoreCase("ISO-8859-1")) { 231 fTidy.setCharEncoding(Configuration.LATIN1); 232 return true; 233 } else if (htmlEncoding.equalsIgnoreCase("UTF-8")) { 234 fTidy.setCharEncoding(Configuration.UTF8); 235 return true; 236 } else if (htmlEncoding.equalsIgnoreCase("ISO-2022-JP")) { 237 fTidy.setCharEncoding(Configuration.ISO2022); 238 return true; 239 } else { 240 return false; 241 } 242 } 243 244 248 private InputStream makeUTF8InputStream(InputSource input) 249 throws IOException { 250 251 Reader reader = InputSourceOps.open(input); 253 try { 254 ByteArrayOutputStream utf8Bytes = new ByteArrayOutputStream (); 255 OutputStreamWriter writer = new OutputStreamWriter (utf8Bytes, "UTF-8"); 256 char buffer[] = new char[4096]; 257 int readSize; 258 while ((readSize = reader.read(buffer)) >= 0) { 259 writer.write(buffer, 0, readSize); 260 } 261 writer.flush(); 262 263 fTidy.setCharEncoding(Configuration.UTF8); 265 return new ByteArrayInputStream (utf8Bytes.toByteArray()); 266 267 } finally { 268 InputSourceOps.closeIfOpened(input, reader); 269 } 270 } 271 272 279 private InputStream getInputStream(InputSource input) throws IOException { 280 String htmlEncoding = input.getEncoding(); 281 if (!checkForTidyEncoding(htmlEncoding)) { 282 return makeUTF8InputStream(input); 284 } 285 if (input.getByteStream() != null) { 286 return input.getByteStream(); 288 } 289 if (input.getCharacterStream() != null) { 290 return makeUTF8InputStream(input); 292 } 293 return InputSourceOps.openSystemId(input.getSystemId()); 295 } 296 297 300 private void setProprietaryTags(HTMLSection htmlSection) throws XMLCException { 301 303 HTMLTagDef[] tagDefs = htmlSection.getHTMLTagDefs(); 304 TagTable tagTable = fTidy.getConfiguration().getTagTable(); 305 for (int idx = 0; idx < tagDefs.length; idx++) { 306 addTag(tagTable, tagDefs[idx]); 307 } 308 309 HTMLAttrDef[] attrDefs = htmlSection.getHTMLAttrDefs(); 310 AttributeTable attributeTable = AttributeTable.getDefaultAttributeTable(); 311 for (int idx = 0; idx < attrDefs.length; idx++) { 312 attributeTable.install(new Attribute(attrDefs[idx].getName().toLowerCase(), 313 Dict.VERS_PROPRIETARY, 314 null)); 315 } 316 } 317 318 321 public XMLCDocument parse(InputSource input, 322 LineNumberMap lineNumberMap, 323 XMLCDomFactory domFactory, 324 MetaData metaData, 325 ErrorReporter errorReporter, 326 ParseTracer tracer) 327 throws IOException , XMLCException { 328 329 validateConf(ParserType.TIDY, metaData); 330 331 fTracer = tracer; 332 fDocBuilder = new HTMLDocBuilder(domFactory, input); 333 334 setProprietaryTags(metaData.getHTMLSection()); 335 fTidy.setInputStreamName(input.getSystemId()); 336 fTidy.setErrout(new TidyErrorHandler(errorReporter, 337 input.getSystemId(), 338 lineNumberMap)); 339 340 InputStream srcFileStream = getInputStream(input); 341 org.w3c.tidy.Node tRoot; 342 try { 343 tRoot = fTidy.parse(srcFileStream, null); 344 } finally { 345 if (!InputSourceOps.isOpen(input)) { 347 srcFileStream.close(); 348 } 349 } 350 if (errorReporter.getErrorCnt() != 0) { 352 handleParseErrors(errorReporter); 353 } 354 355 if ((fTracer != null) && fTracer.enabled()) { 356 printNode(tRoot); 357 } 358 359 buildNode(tRoot); 361 addPCDataContentElements(fDocBuilder.getXMLCDocument()); 362 return fDocBuilder.getXMLCDocument(); 363 } 364 365 368 private void addTag(TagTable tagTable, 369 HTMLTagDef tagDef) throws XMLCException { 370 Parser tagParser = null; 371 int model = 0; 372 373 if (tagDef.getEmpty()) { 376 model |= Dict.CM_EMPTY|Dict.CM_OPT; 377 tagParser = ParserImpl.getParseInline(); 378 } 379 if (tagDef.getInline()) { 380 model |= Dict.CM_INLINE; 381 tagParser = ParserImpl.getParseInline(); 382 } 383 if (tagDef.getBlock()) { 384 model |= Dict.CM_BLOCK; 385 tagParser = ParserImpl.getParseBlock(); 386 } 387 if (model == 0) { 388 throw new XMLCException("must specify at least one on TAG_CM_EMPTY, TAG_CM_INLINE, or TAG_CM_BLOCK"); 389 } 390 391 if (tagDef.getOptclose()) { 393 model |= Dict.CM_OPT; 394 } 395 396 model |= Dict.CM_HEAD|Dict.CM_HTML|Dict.CM_MIXED; 398 399 tagTable.install(new Dict(tagDef.getName().toLowerCase(), 400 Dict.VERS_PROPRIETARY, 401 model, tagParser, null)); 402 } 403 } 404 | Popular Tags |