| 1 16 package org.outerj.daisy.htmlcleaner; 17 18 import org.xml.sax.SAXException ; 19 import org.xml.sax.Attributes ; 20 import org.xml.sax.ContentHandler ; 21 import org.xml.sax.helpers.AttributesImpl ; 22 import org.outerj.daisy.xmlutil.SaxBuffer; 23 24 import java.util.*; 25 26 46 class HtmlRepairer { 47 private HtmlCleanerTemplate template; 48 53 private static Set wipeableEmptyElements; 54 static { 55 wipeableEmptyElements = new HashSet(); 56 wipeableEmptyElements.add("strong"); 57 wipeableEmptyElements.add("em"); 58 wipeableEmptyElements.add("sub"); 59 wipeableEmptyElements.add("sup"); 60 wipeableEmptyElements.add("a"); 61 wipeableEmptyElements.add("tt"); 62 wipeableEmptyElements.add("ul"); 63 wipeableEmptyElements.add("del"); 64 } 65 private static final char[] NEWLINE = new char[] { '\n' }; 66 private static Set contentBlockElements; 67 static { 68 contentBlockElements = new HashSet(); 69 contentBlockElements.add("p"); 70 contentBlockElements.add("h1"); 71 contentBlockElements.add("h2"); 72 contentBlockElements.add("h3"); 73 contentBlockElements.add("h4"); 74 contentBlockElements.add("h5"); 75 contentBlockElements.add("blockquote"); 76 } 77 private static Set needsCleanupOfEndBrs; 78 static { 79 needsCleanupOfEndBrs = new HashSet(); 80 needsCleanupOfEndBrs.add("th"); 81 needsCleanupOfEndBrs.add("td"); 82 needsCleanupOfEndBrs.add("li"); 83 } 84 85 public HtmlRepairer(HtmlCleanerTemplate template) { 86 this.template = template; 87 } 88 89 95 public void clean(SaxBuffer buffer, ContentHandler contentHandler) throws SAXException { 96 Cleaner cleaner = new Cleaner(buffer, contentHandler); 97 cleaner.clean(); 98 } 99 100 private class Cleaner { 101 private ContentHandler finalContentHandler; 102 private SaxBuffer input; 103 104 private ContentHandler contentHandler; 105 private ArrayList openElements; 106 107 public Cleaner(SaxBuffer input, ContentHandler contentHandler) { 108 this.input = input; 109 this.finalContentHandler = contentHandler; 110 } 111 112 private void clean() throws SAXException { 113 contentHandler = new SaxBuffer(); 117 this.openElements = new ArrayList(); 118 119 elementCleanup(input.getBits()); 120 121 SaxBuffer elementCleanupOutput = (SaxBuffer)contentHandler; 122 this.contentHandler = new SaxBuffer(); 123 this.openElements = new ArrayList(); 124 125 introduceParas(elementCleanupOutput.getBits()); 126 127 SaxBuffer introduceParasOutput = (SaxBuffer)contentHandler; 128 this.contentHandler = new SaxBuffer(); 129 this.openElements = new ArrayList(); 130 131 structuralCleanup(introduceParasOutput.getBits()); 132 133 SaxBuffer structuralCleanupOutput = (SaxBuffer)contentHandler; 134 this.contentHandler = new SaxBuffer(); 135 this.openElements = new ArrayList(); 136 137 cleanupBrsAndEmptyContentBlocks(structuralCleanupOutput.getBits()); 138 139 SaxBuffer contentBlockCleanupOutput = (SaxBuffer)contentHandler; 140 this.contentHandler = new SaxBuffer(); 141 this.openElements = new ArrayList(); 142 143 cleanupWipeableEmptyElements(contentBlockCleanupOutput.getBits()); 144 145 SaxBuffer inlineCleanupOutput = (SaxBuffer)contentHandler; 146 this.contentHandler = new SaxBuffer(); 147 this.openElements = new ArrayList(); 148 149 cleanupBrsAndEmptyContentBlocks(inlineCleanupOutput.getBits()); 151 152 SaxBuffer secondContentBlockCleanupOutput = (SaxBuffer)contentHandler; 153 this.contentHandler = finalContentHandler; 154 this.openElements = new ArrayList(); 155 156 translateBeeaarsInPees(secondContentBlockCleanupOutput.getBits()); 157 } 158 159 167 private void elementCleanup(List bits) throws SAXException { 168 Stack endElements = new Stack(); 169 boolean preSupported = template.descriptors.containsKey("pre"); 170 171 int i = 0; 172 while (i < bits.size()) { 173 Object bit = bits.get(i); 174 if (bit instanceof SaxBuffer.StartElement) { 175 SaxBuffer.StartElement startElement = (SaxBuffer.StartElement)bit; 176 if (!startElement.namespaceURI.equals("")) { 177 endElements.add(new EndElementInfo()); 179 } else { 180 if (startElement.localName.equals("span")) { 181 String classAttr = startElement.attrs.getValue("class"); 185 if (classAttr != null) { 186 if (template.allowedSpanClasses.contains(classAttr)) { 187 AttributesImpl attrs = new AttributesImpl (); 189 attrs.addAttribute("", "class", "class", "CDATA", classAttr); 190 startElement("span", attrs); 191 endElements.push(new EndElementInfo("span")); 192 } else { 193 endElements.push(new EndElementInfo()); 195 } 196 } else { 197 String styleAttr = startElement.attrs.getValue("style"); 198 if (styleAttr != null) { 199 StringTokenizer styleAttrTokenizer = new StringTokenizer(styleAttr, ";"); 200 boolean hasBold = false; 201 boolean hasItalic = false; 202 while (styleAttrTokenizer.hasMoreTokens()) { 203 String styleToken = styleAttrTokenizer.nextToken(); 204 int colonPos = styleToken.indexOf(':'); 205 if (colonPos != -1) { 206 String name = styleToken.substring(0, colonPos).trim().toLowerCase(); 207 String value = styleToken.substring(colonPos + 1).trim().toLowerCase(); 208 if (name.equals("font-weight") && value.equals("bold")) { 209 hasBold = true; 210 } else if (name.equals("font-style") && value.equals("italic")) { 211 hasItalic = true; 212 } 213 } 214 } 215 216 MultiEndElementInfo endElement = new MultiEndElementInfo(); 217 if (hasBold) { 218 startElement("strong", new AttributesImpl ()); 219 endElement.add(new EndElementInfo("strong")); 220 } 221 if (hasItalic) { 222 startElement("em", new AttributesImpl ()); 223 endElement.add(new EndElementInfo("em")); 224 } 225 endElements.push(endElement); 226 } else { 227 endElements.push(new EndElementInfo()); 228 } 229 } 230 } else if (startElement.localName.equals("div")) { 231 String classAttr = startElement.attrs.getValue("class"); 232 if (classAttr != null && template.allowedDivClasses.contains(classAttr)) { 233 AttributesImpl attrs = new AttributesImpl (); 234 attrs.addAttribute("", "class", "class", "CDATA", classAttr); 235 startElement("div", attrs); 236 endElements.push(new EndElementInfo("div")); 237 } else { 238 endElements.push(new EndElementInfo()); 240 } 241 } else if (startElement.localName.equals("p")) { 242 String classAttr = startElement.attrs.getValue("class"); 243 if (classAttr != null && template.allowedParaClasses.contains(classAttr)) { 244 startElement("p", getAllowedAttributes(startElement)); 245 endElements.push(new EndElementInfo("p")); 246 } else { 247 AttributesImpl attrs = getAllowedAttributes(startElement); 248 int classPos = attrs.getIndex("class"); 249 if (classPos != -1) 250 attrs.removeAttribute(classPos); 251 startElement("p", attrs); 252 endElements.push(new EndElementInfo("p")); 253 } 254 } else if (startElement.localName.equals("pre") && preSupported) { 255 String classAttr = startElement.attrs.getValue("class"); 256 if (classAttr != null && template.allowedPreClasses.contains(classAttr)) { 257 startElement("pre", getAllowedAttributes(startElement)); 258 endElements.push(new EndElementInfo("pre")); 259 } else { 260 AttributesImpl attrs = getAllowedAttributes(startElement); 261 int classPos = attrs.getIndex("class"); 262 if (classPos != -1) 263 attrs.removeAttribute(classPos); 264 startElement("pre", attrs); 265 endElements.push(new EndElementInfo("pre")); 266 } 267 } else if (startElement.localName.equals("b")) { 268 startElement("strong", new AttributesImpl ()); 270 endElements.push(new EndElementInfo("strong")); 271 } else if (startElement.localName.equals("i")) { 272 startElement("em", new AttributesImpl ()); 274 endElements.push(new EndElementInfo("em")); 275 } else if (startElement.localName.equals("strike")) { 276 startElement("del", new AttributesImpl ()); 278 endElements.push(new EndElementInfo("del")); 279 } else if (startElement.localName.equals("html")) { 280 if (openElements.size() != 0) 281 throw new SAXException ("html element can only appear as root element."); 282 283 startElement(startElement.localName, new AttributesImpl ()); 284 endElements.push(new EndElementInfo(startElement.localName)); 285 286 while (true) { 288 i++; 289 if (i >= bits.size()) 290 throw new SAXException ("Reached end of input without encountering opening body tag."); 291 292 Object nextBit = bits.get(i); 293 if (nextBit instanceof SaxBuffer.StartElement && ((SaxBuffer.StartElement)nextBit).localName.equals("body") 294 && ((SaxBuffer.StartElement)nextBit).namespaceURI.equals("")) { 295 i--; 296 break; 297 } 298 } 299 300 } else if (startElement.localName.equals("body")) { 301 if (openElements.size() != 1) 302 throw new SAXException ("body element can only appear as child of html element"); 303 304 if (!((StartElementInfo)openElements.get(0)).getName().equals("html")) 305 throw new SAXException ("body element can only appear as child of html element"); 306 307 startElement("body", new AttributesImpl ()); 308 endElements.push(new EndElementInfo("body")); 309 } else if (startElement.localName.equals("img") && template.descriptors.containsKey("img")) { 310 AttributesImpl attrs = getAllowedAttributes(startElement); 311 if (template.imgAlternateSrcAttr != null) { 312 String altSrc = startElement.attrs.getValue(template.imgAlternateSrcAttr); 313 if (altSrc != null && !altSrc.equals("")) { 314 int hrefIndex = attrs.getIndex("src"); 315 if (hrefIndex != -1) 316 attrs.setValue(hrefIndex, altSrc); 317 else 318 attrs.addAttribute("", "src", "src", "CDATA", altSrc); 319 } 320 } 321 startElement(startElement.localName, attrs); 322 endElements.push(new EndElementInfo(startElement.localName)); 323 } else if (startElement.localName.equals("a") && template.descriptors.containsKey("a")) { 324 AttributesImpl attrs = getAllowedAttributes(startElement); 325 if (template.linkAlternateHrefAttr != null) { 326 String altHref = startElement.attrs.getValue(template.linkAlternateHrefAttr); 327 if (altHref != null && !altHref.equals("")) { 328 int hrefIndex = attrs.getIndex("href"); 329 if (hrefIndex != -1) 330 attrs.setValue(hrefIndex, altHref); 331 else 332 attrs.addAttribute("", "href", "href", "CDATA", altHref); 333 } 334 } 335 startElement(startElement.localName, attrs); 336 endElements.push(new EndElementInfo(startElement.localName)); 337 } else if (startElement.localName.equals("td") || startElement.localName.equals("th")) { 338 AttributesImpl attrs = getAllowedAttributes(startElement); 339 340 String rowspan = attrs.getValue("rowspan"); 342 if (rowspan != null && rowspan.equals("1")) { 343 attrs.removeAttribute(attrs.getIndex("rowspan")); 344 } 345 String colspan = attrs.getValue("colspan"); 346 if (colspan != null && colspan.equals("1")) { 347 attrs.removeAttribute(attrs.getIndex("colspan")); 348 } 349 350 startElement(startElement.localName, attrs); 351 endElements.push(new EndElementInfo(startElement.localName)); 352 } else if (template.descriptors.containsKey(startElement.localName)) { 353 startElement(startElement.localName, getAllowedAttributes(startElement)); 354 endElements.push(new EndElementInfo(startElement.localName)); 355 } else { 356 endElements.push(new EndElementInfo()); 358 } 359 } 360 } else if (bit instanceof SaxBuffer.EndElement) { 361 XMLizable endElement = (XMLizable)endElements.pop(); 362 endElement.toSAX(contentHandler); 363 } else if (bit instanceof SaxBuffer.Characters) { 364 ((SaxBuffer.Characters)bit).send(contentHandler); 365 } else if (bit instanceof SaxBuffer.StartDocument) { 366 contentHandler.startDocument(); 367 } else if (bit instanceof SaxBuffer.EndDocument) { 368 contentHandler.endDocument(); 369 return; 371 } 372 i++; 373 } 374 } 375 376 private AttributesImpl getAllowedAttributes(SaxBuffer.StartElement startElement) { 377 String [] allowedAttributes = ((ElementDescriptor)template.descriptors.get(startElement.localName)).getAttributeNames(); 379 AttributesImpl attrs = new AttributesImpl (); 380 for (int k = 0; k < allowedAttributes.length; k++) { 381 String value = startElement.attrs.getValue(allowedAttributes[k]); 382 if (value != null) { 383 attrs.addAttribute("", allowedAttributes[k], allowedAttributes[k], "CDATA", value); 384 } 385 } 386 return attrs; 387 } 388 389 393 private void introduceParas(List bits) throws SAXException { 394 Stack endElements = new Stack(); 395 IntStack introducedParas = new IntStack(); 396 ElementDescriptor bodyDescriptor = (ElementDescriptor)template.descriptors.get("body"); 397 ElementDescriptor tdDescriptor = (ElementDescriptor)template.descriptors.get("td"); 398 ElementDescriptor thDescriptor = (ElementDescriptor)template.descriptors.get("th"); 399 ElementDescriptor paraDescriptor = (ElementDescriptor)template.descriptors.get("p"); 400 401 int i = -1; 402 while (i < bits.size()) { 403 i++; 404 Object bit = bits.get(i); 405 if (bit instanceof SaxBuffer.StartElement) { 406 SaxBuffer.StartElement startElement = (SaxBuffer.StartElement)bit; 407 408 if (!introducedParas.empty() && introducedParas.peek() == 0 && !paraDescriptor.childAllowed(startElement.localName)) { 409 endElement("p"); 410 introducedParas.pop(); 411 } else if (openElements.size() > 1) { 412 StartElementInfo parentInfo = (StartElementInfo)openElements.get(openElements.size() - 1); 413 String parentName = parentInfo.getName(); 414 boolean startPara = 415 (parentName.equals("body") && !bodyDescriptor.childAllowed(startElement.localName) 416 && paraDescriptor.childAllowed(startElement.localName)) 417 || 418 (parentName.equals("td") && !tdDescriptor.childAllowed(startElement.localName) 419 && paraDescriptor.childAllowed(startElement.localName)) 420 || 421 (parentName.equals("th") && !thDescriptor.childAllowed(startElement.localName) 422 && paraDescriptor.childAllowed(startElement.localName)); 423 424 if (startPara) { 425 startElement("p", new AttributesImpl ()); 426 introducedParas.push(0); 427 } 428 } 429 430 if (!introducedParas.empty()) { 431 introducedParas.push(introducedParas.pop() + 1); 432 } 433 434 startElement(startElement.localName, startElement.attrs); 435 endElements.push(new EndElementInfo(startElement.localName)); 436 437 438 } else if (bit instanceof SaxBuffer.EndElement) { 439 if (!introducedParas.empty() && introducedParas.peek() == 0) { 440 endElement("p"); 441 introducedParas.pop(); 442 } 443 444 XMLizable endElement = (XMLizable)endElements.pop(); 445 endElement.toSAX(contentHandler); 446 447 if (!introducedParas.empty()) { 448 introducedParas.push(introducedParas.pop() - 1); 449 } 450 } else if (bit instanceof SaxBuffer.Characters) { 451 if (openElements.size() > 1) { 452 StartElementInfo parentInfo = (StartElementInfo)openElements.get(openElements.size() - 1); 453 String parentName = parentInfo.getName(); 454 boolean startPara = parentName.equals("body") || parentName.equals("td") || parentName.equals("th") || parentName.equals("blockquote"); 455 if (startPara) { 456 startElement("p", new AttributesImpl ()); 457 introducedParas.push(0); 458 } 459 } 460 ((SaxBuffer.Characters)bit).send(contentHandler); 461 } else if (bit instanceof SaxBuffer.StartDocument) { 462 contentHandler.startDocument(); 463 } else if (bit instanceof SaxBuffer.EndDocument) { 464 contentHandler.endDocument(); 465 return; 467 } 468 } 469 } 470 471 475 private void structuralCleanup(List bits) throws SAXException { 476 Stack endElements = new Stack(); 477 478 int i = -1; 479 while (i < bits.size()) { 480 i++; 481 Object bit = bits.get(i); 482 if (bit instanceof SaxBuffer.StartElement) { 483 SaxBuffer.StartElement startElement = (SaxBuffer.StartElement)bit; 484 485 ElementDescriptor descriptor = (ElementDescriptor)template.descriptors.get(startElement.localName); 486 if (descriptor == null) 487 throw new SAXException ("Missing ElementDescriptor for tagname " + startElement.localName); 488 489 if (openElements.size() > 0) { 491 String parentElementName = ((StartElementInfo)openElements.get(openElements.size() - 1)).getName(); 492 ElementDescriptor parentDescriptor = (ElementDescriptor)template.descriptors.get(parentElementName); 493 494 boolean allowed = parentDescriptor.childAllowed(startElement.localName); 495 496 if (allowed) { 498 startElement(startElement.localName, startElement.attrs); 499 endElements.push(new EndElementInfo(startElement.localName)); 500 continue; 501 } 502 503 int firstGoodAncestor = -1; 505 for (int k = openElements.size() - 2; k >= 0; k--) { 506 String ancestorElementName = ((StartElementInfo)openElements.get(k)).getName(); 507 ElementDescriptor ancestorDescriptor = (ElementDescriptor)template.descriptors.get(ancestorElementName); 508 if (ancestorDescriptor.childAllowed(startElement.localName)) { 509 firstGoodAncestor = k; 510 break; 511 } 512 } 513 514 if (firstGoodAncestor == -1) 515 throw new SAXException ("Element \"" + startElement.localName + "\" is disallowed at its current location, and could not automatically fix this."); 516 517 MultiEndElementInfo endElementInfo = new MultiEndElementInfo(); 519 for (int k = openElements.size() - 1; k > firstGoodAncestor; k--) { 520 endElementInfo.add((StartElementInfo)openElements.get(k)); 521 } 522 endElementInfo.add(new EndElementInfo(startElement.localName)); 523 524 for (int k = openElements.size() - 1; k > firstGoodAncestor; k--) { 525 endElement(((StartElementInfo)openElements.get(k)).getName()); 526 } 527 528 startElement(startElement.localName, startElement.attrs); 529 endElements.push(endElementInfo); 530 531 } else { 532 startElement(startElement.localName, startElement.attrs); 533 endElements.push(new EndElementInfo(startElement.localName)); 534 } 535 536 } else if (bit instanceof SaxBuffer.EndElement) { 537 XMLizable endElement = (XMLizable)endElements.pop(); 538 endElement.toSAX(contentHandler); 539 } else if (bit instanceof SaxBuffer.Characters) { 540 ((SaxBuffer.Characters)bit).send(contentHandler); 541 } else if (bit instanceof SaxBuffer.StartDocument) { 542 contentHandler.startDocument(); 543 } else if (bit instanceof SaxBuffer.EndDocument) { 544 contentHandler.endDocument(); 545 return; 547 } 548 } 549 } 550 551 556 private void cleanupBrsAndEmptyContentBlocks(List bits) throws SAXException { 557 Stack endElements = new Stack(); 558 559 int i = -1; 560 while (i < bits.size()) { 561 i++; 562 Object bit = bits.get(i); 563 if (bit instanceof SaxBuffer.StartElement) { 564 SaxBuffer.StartElement startElement = (SaxBuffer.StartElement)bit; 565 566 boolean contentBlockElement = contentBlockElements.contains(startElement.localName); 567 if (contentBlockElement || startElement.localName.equals("td") || startElement.localName.equals("th")) { 568 int elementNesting = 0; 570 int z = i; 571 boolean reachedEndElement = false; 572 while (true) { 573 z++; 574 Object bit2 = bits.get(z); 575 if (bit2 instanceof SaxBuffer.Characters && isWhitespace((SaxBuffer.Characters)bit2)) { 576 } else if (bit2 instanceof SaxBuffer.StartElement 578 && ((SaxBuffer.StartElement)bit2).localName.equals("br")) { 579 elementNesting++; 580 } else if (bit2 instanceof SaxBuffer.EndElement 581 && ((SaxBuffer.EndElement)bit2).localName.equals("br")) { 582 elementNesting--; 583 } else if (bit2 instanceof SaxBuffer.EndElement && elementNesting == 0) { 584 reachedEndElement = true; 585 break; 586 } else { 587 break; 588 } 589 } 590 591 if (reachedEndElement) { 592 if (contentBlockElement) { 593 i = z; 595 continue; 596 } else { 597 startElement(startElement.localName, startElement.attrs); 598 endElements.push(new EndElementInfo(startElement.localName)); 599 i = z - 1; 601 continue; 602 } 603 } else { 604 if (contentBlockElement) { 605 i = z - 1; 607 } else { 608 } 610 } 611 612 } else if (startElement.localName.equals("br")) { 613 int firstContentBlockAncestor = -1; 615 for (int k = openElements.size() - 1; k >= 0; k--) { 616 StartElementInfo startElementInfo = (StartElementInfo)openElements.get(k); 617 if (contentBlockElements.contains(startElementInfo.getName())) { 618 firstContentBlockAncestor = k; 619 break; 620 } 621 } 622 623 if (firstContentBlockAncestor != -1) { 625 int z = i; 627 int brCount = 1; 628 boolean continueSearch = true; 629 while (continueSearch) { 630 z++; 631 Object bit2 = bits.get(z); 632 if (bit2 instanceof SaxBuffer.EndElement) { 633 String name = ((SaxBuffer.EndElement)bit2).localName; 634 if (!name.equals("br")) { 635 continueSearch = false; 636 } 637 } else if (bit2 instanceof SaxBuffer.StartElement 638 && ((SaxBuffer.StartElement)bit2).localName.equals("br")) { 639 brCount++; 640 continueSearch = true; 641 } else if (bit2 instanceof SaxBuffer.Characters && isWhitespace((SaxBuffer.Characters)bit2)) { 642 continueSearch = true; 643 } else { 644 continueSearch = false; 645 } 646 } 647 648 boolean beforeEndContentBlock = false; 651 for (int t = z; t < bits.size(); t++) { 652 if (bits.get(t) instanceof SaxBuffer.EndElement) { 653 SaxBuffer.EndElement endEl = (SaxBuffer.EndElement)bits.get(t); 654 if (contentBlockElements.contains(endEl.localName)) { 655 beforeEndContentBlock = true; 656 break; 657 } 658 } else if (bits.get(t) instanceof SaxBuffer.Characters && isWhitespace((SaxBuffer.Characters)bits.get(t))) { 660 } else { 662 break; 664 } 665 } 666 if (beforeEndContentBlock) { 667 i = z - 1; 668 continue; 669 } 670 671 if (brCount >= 2) { 672 i = z - 1; 675 List elementsToRestart = new ArrayList(); 676 for (int k = openElements.size() - 1; k >= firstContentBlockAncestor; k--) { 677 elementsToRestart.add(openElements.get(k)); 678 } 679 680 for (int k = openElements.size() - 1; k >= firstContentBlockAncestor; k--) { 681 endElement(((StartElementInfo)openElements.get(k)).getName()); 682 } 683 684 for (int k = elementsToRestart.size() - 1; k >= 0; k--) { 685 StartElementInfo startElementInfo = (StartElementInfo)elementsToRestart.get(k); 686 startElement(startElementInfo.getName(), startElementInfo.getAttrs()); 687 } 688 continue; 689 } 690 } else if (startElement.localName.equals("br") && openElements.size() > 1 691 && needsCleanupOfEndBrs.contains(((StartElementInfo)openElements.get(openElements.size() - 1)).getName())) { 692 String elementName = ((StartElementInfo)openElements.get(openElements.size() - 1)).getName(); 694 695 boolean nextIsEndOfElement = false; 696 int r = i + 1; 697 for (; r < bits.size(); r++) { 698 Object nextBit = bits.get(r); 699 if (nextBit instanceof SaxBuffer.EndElement) { 700 SaxBuffer.EndElement endEl = (SaxBuffer.EndElement)nextBit; 701 if (endEl.localName.equals("br")) { 702 continue; 703 } else if (endEl.localName.equals(elementName)) { 704 nextIsEndOfElement = true; 705 break; 706 } else { 707 break; 708 } 709 } else if (nextBit instanceof SaxBuffer.Characters && isWhitespace((SaxBuffer.Characters)nextBit)) { 710 } else { 712 break; 713 } 714 } 715 716 if (nextIsEndOfElement) { 717 i = r - 1; 718 continue; 719 } 720 } 721 } 722 723 startElement(startElement.localName, startElement.attrs); 724 endElements.push(new EndElementInfo(startElement.localName)); 725 726 727 } else if (bit instanceof SaxBuffer.EndElement) { 728 XMLizable endElement = (XMLizable)endElements.pop(); 729 endElement.toSAX(contentHandler); 730 } else if (bit instanceof SaxBuffer.Characters) { 731 ((SaxBuffer.Characters)bit).send(contentHandler); 732 } else if (bit instanceof SaxBuffer.StartDocument) { 733 contentHandler.startDocument(); 734 } else if (bit instanceof SaxBuffer.EndDocument) { 735 contentHandler.endDocument(); 736 return; 738 } 739 } 740 } 741 742 private void cleanupWipeableEmptyElements(List bits) throws SAXException { 743 Stack endElements = new Stack(); 744 745 int i = -1; 746 while (i < bits.size()) { 747 i++; 748 Object bit = bits.get(i); 749 if (bit instanceof SaxBuffer.StartElement) { 750 SaxBuffer.StartElement startElement = (SaxBuffer.StartElement)bit; 751 if (wipeableEmptyElements.contains(startElement.localName)) { 752 boolean hasWhitespace = false; 753 boolean reachedEndElement = false; 754 int elementNesting = 0; 755 int k = i; 756 while (true) { 757 k++; 758 Object nextBit = bits.get(k); 759 if (nextBit instanceof SaxBuffer.StartElement && wipeableEmptyElements.contains(((SaxBuffer.StartElement)nextBit).localName)) { 760 elementNesting++; 761 } else if (nextBit instanceof SaxBuffer.Characters && isWhitespace((SaxBuffer.Characters)nextBit)) { 762 hasWhitespace = true; 763 } else if (nextBit instanceof SaxBuffer.EndElement && elementNesting > 0) { 764 elementNesting--; 765 } else if (nextBit instanceof SaxBuffer.EndElement && elementNesting == 0) { 766 reachedEndElement = true; 767 break; 768 } else { 769 break; 770 } 771 } 772 773 if (reachedEndElement) { 774 i = k; 776 if (hasWhitespace) 778 contentHandler.characters(new char[] { ' ' }, 0, 1); 779 continue; 780 } 781 } 782 783 startElement(startElement.localName, startElement.attrs); 784 endElements.push(new EndElementInfo(startElement.localName)); 785 786 } else if (bit instanceof SaxBuffer.EndElement) { 787 XMLizable endElement = (XMLizable)endElements.pop(); 788 endElement.toSAX(contentHandler); 789 } else if (bit instanceof SaxBuffer.Characters) { 790 ((SaxBuffer.Characters)bit).send(contentHandler); 791 } else if (bit instanceof SaxBuffer.StartDocument) { 792 contentHandler.startDocument(); 793 } else if (bit instanceof SaxBuffer.EndDocument) { 794 contentHandler.endDocument(); 795 return; 797 } 798 } 799 } 800 801 804 private void translateBeeaarsInPees(List bits) throws SAXException { 805 int preLevel = 0; 806 int i = -1; 807 while (i < bits.size()) { 808 i++; 809 Object bit = bits.get(i); 810 if (bit instanceof SaxBuffer.StartElement) { 811 SaxBuffer.StartElement startElement = (SaxBuffer.StartElement)bit; 812 if (startElement.localName.equals("pre")) { 813 preLevel++; 814 } else if (preLevel > 0 && startElement.localName.equals("br")) { 815 Object nextBit = bits.get(i + 1); 818 if (nextBit instanceof SaxBuffer.EndElement && ((SaxBuffer.EndElement)nextBit).localName.equals("br")) { 819 contentHandler.characters(NEWLINE, 0, 1); 821 i++; 822 continue; 823 } 824 } 825 826 startElement(startElement.localName, startElement.attrs); 827 828 } else if (bit instanceof SaxBuffer.EndElement) { 829 SaxBuffer.EndElement endElement = (SaxBuffer.EndElement)bit; 830 if (endElement.localName.equals("pre")) { 831 preLevel--; 832 } 833 contentHandler.endElement(endElement.namespaceURI, endElement.localName, endElement.qName); 834 } else if (bit instanceof SaxBuffer.Characters) { 835 ((SaxBuffer.Characters)bit).send(contentHandler); 836 } else if (bit instanceof SaxBuffer.StartDocument) { 837 contentHandler.startDocument(); 838 } else if (bit instanceof SaxBuffer.EndDocument) { 839 contentHandler.endDocument(); 840 return; 842 } 843 } 844 } 845 846 private boolean isWhitespace(SaxBuffer.Characters characters) { 847 for (int i = 0; i < characters.ch.length; i++) { 848 if (!(Character.isWhitespace(characters.ch[i]) || characters.ch[i] == (char)160)) return false; 850 } 851 return true; 852 } 853 854 private void startElement(String name, Attributes attrs) throws SAXException { 855 contentHandler.startElement("", name, name, attrs); 856 openElements.add(new StartElementInfo(name, attrs)); 857 } 858 859 private void endElement(String name) throws SAXException { 860 contentHandler.endElement("", name, name); 861 String removed = ((StartElementInfo)openElements.remove(openElements.size() - 1)).getName(); 862 if (!removed.equals(name)) { 863 throw new SAXException ("The close tag \"" + name + "\" did not match the open tag \"" + removed + "\"."); 864 } 865 } 866 867 private class StartElementInfo implements XMLizable { 868 private final String name; 869 private final Attributes attrs; 870 871 public StartElementInfo(String name, Attributes attrs) { 872 this.name = name; 873 this.attrs = attrs; 874 } 875 876 public String getName() { 877 return name; 878 } 879 880 public Attributes getAttrs() { 881 return attrs; 882 } 883 884 public void toSAX(ContentHandler contentHandler) throws SAXException { 885 startElement(name, attrs); 886 } 887 } 888 889 private class EndElementInfo implements XMLizable { 890 private final boolean skip; 891 private final String localName; 892 893 public EndElementInfo() { 894 this.skip = true; 895 this.localName = null; 896 } 897 898 public EndElementInfo(String localName) { 899 this.skip = false; 900 this.localName = localName; 901 } 902 903 public void toSAX(ContentHandler contentHandler) throws SAXException { 904 if (!skip) { 905 endElement(localName); 906 } 907 } 908 } 909 910 private final class MultiEndElementInfo implements XMLizable { 911 private ArrayList tags = new ArrayList(2); 912 913 public void add(EndElementInfo endElement) { 914 this.tags.add(endElement); 915 } 916 917 public void add(StartElementInfo endElement) { 918 this.tags.add(endElement); 919 } 920 921 public void toSAX(ContentHandler contentHandler) throws SAXException { 922 for (int i = tags.size() - 1; i >= 0; i--) { 923 XMLizable tag = (XMLizable)tags.get(i); 924 tag.toSAX(contentHandler); 925 } 926 } 927 } 928 929 } 930 931 interface XMLizable { 932 public void toSAX(ContentHandler contentHandler) throws SAXException ; 933 } 934 935 static class IntStack { 936 private Stack stack = new Stack(); 937 938 public void push(int value) { 939 stack.push(new Integer (value)); 940 } 941 942 public int pop() { 943 return ((Integer )stack.pop()).intValue(); 944 } 945 946 public boolean empty() { 947 return stack.empty(); 948 } 949 950 public int peek() { 951 return ((Integer )stack.peek()).intValue(); 952 } 953 } 954 } 955 | Popular Tags |