KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > outerj > daisy > htmlcleaner > HtmlRepairer


1 /*
2  * Copyright 2004 Outerthought bvba and Schaubroeck nv
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */

16 package org.outerj.daisy.htmlcleaner;
17
18 import org.xml.sax.SAXException JavaDoc;
19 import org.xml.sax.Attributes JavaDoc;
20 import org.xml.sax.ContentHandler JavaDoc;
21 import org.xml.sax.helpers.AttributesImpl JavaDoc;
22 import org.outerj.daisy.xmlutil.SaxBuffer;
23
24 import java.util.*;
25
26 /**
27  * Works on HTML input to clean it out to a
28  * limited subset of HTML, mostly focussing on structural/semantic
29  * elements (actually, what should be kept is configurable).
30  *
31  * <p>The input events should be in no namespace and contain
32  * the html and body tags. All elements and attributes should already
33  * be lowercased.</p>
34  *
35  * <p>All elements and attributes that are not explicitly allowed
36  * in the configuration will be dropped (but
37  * their character content will remain).</p>
38  *
39  * <p>Span and div elements are treated specially. They will only be
40  * kept if their class attribute has one of the allowed values, specified
41  * in the configuration of this component. Span elements that contain
42  * a style attribute specifying bold and/or italic styling will
43  * be converted to the equivalent strong/em tags.</p>
44  *
45  */

46 class HtmlRepairer {
47     private HtmlCleanerTemplate template;
48     /**
49      * Hardcoded set of elements that can be removed if they contain
50      * no character data or only other wipeable elements. Usually used
51      * for inline elements.
52      */

53     private static Set wipeableEmptyElements;
54     static {
55         wipeableEmptyElements = new HashSet();
56         wipeableEmptyElements.add("strong");
57         wipeableEmptyElements.add("em");
58         wipeableEmptyElements.add("sub");
59         wipeableEmptyElements.add("sup");
60         wipeableEmptyElements.add("a");
61         wipeableEmptyElements.add("tt");
62         wipeableEmptyElements.add("ul");
63         wipeableEmptyElements.add("del");
64     }
65     private static final char[] NEWLINE = new char[] { '\n' };
66     private static Set contentBlockElements;
67     static {
68         contentBlockElements = new HashSet();
69         contentBlockElements.add("p");
70         contentBlockElements.add("h1");
71         contentBlockElements.add("h2");
72         contentBlockElements.add("h3");
73         contentBlockElements.add("h4");
74         contentBlockElements.add("h5");
75         contentBlockElements.add("blockquote");
76     }
77     private static Set needsCleanupOfEndBrs;
78     static {
79         needsCleanupOfEndBrs = new HashSet();
80         needsCleanupOfEndBrs.add("th");
81         needsCleanupOfEndBrs.add("td");
82         needsCleanupOfEndBrs.add("li");
83     }
84
/**
 * Creates a repairer that cleans HTML according to the given template.
 *
 * @param template configuration consulted by the cleaning stages
 *                 (element descriptors, allowed class values, ...)
 */
public HtmlRepairer(HtmlCleanerTemplate template) {
    this.template = template;
}
88
89     /**
90      * Cleans the HTML stored in the SaxBuffer.
91      *
92      * @param buffer should only contain following types of events: start/endElement, start/endDocument, characters
93      * @param contentHandler where the outcome will be send to
94      */

95     public void clean(SaxBuffer buffer, ContentHandler JavaDoc contentHandler) throws SAXException JavaDoc {
96         Cleaner cleaner = new Cleaner(buffer, contentHandler);
97         cleaner.clean();
98     }
99
100     private class Cleaner {
101         private ContentHandler JavaDoc finalContentHandler;
102         private SaxBuffer input;
103
104         private ContentHandler JavaDoc contentHandler;
105         private ArrayList openElements;
106
107         public Cleaner(SaxBuffer input, ContentHandler JavaDoc contentHandler) {
108             this.input = input;
109             this.finalContentHandler = contentHandler;
110         }
111
112         private void clean() throws SAXException JavaDoc {
113             // cleaning happens in multiple stages to make the logic simpler.
114
// The different stages are implemented in pull-style, by reading
115
// events from a SaxBuffer instance.
116
contentHandler = new SaxBuffer();
117             this.openElements = new ArrayList();
118
119             elementCleanup(input.getBits());
120
121             SaxBuffer elementCleanupOutput = (SaxBuffer)contentHandler;
122             this.contentHandler = new SaxBuffer();
123             this.openElements = new ArrayList();
124
125             introduceParas(elementCleanupOutput.getBits());
126
127             SaxBuffer introduceParasOutput = (SaxBuffer)contentHandler;
128             this.contentHandler = new SaxBuffer();
129             this.openElements = new ArrayList();
130
131             structuralCleanup(introduceParasOutput.getBits());
132
133             SaxBuffer structuralCleanupOutput = (SaxBuffer)contentHandler;
134             this.contentHandler = new SaxBuffer();
135             this.openElements = new ArrayList();
136
137             cleanupBrsAndEmptyContentBlocks(structuralCleanupOutput.getBits());
138
139             SaxBuffer contentBlockCleanupOutput = (SaxBuffer)contentHandler;
140             this.contentHandler = new SaxBuffer();
141             this.openElements = new ArrayList();
142
143             cleanupWipeableEmptyElements(contentBlockCleanupOutput.getBits());
144
145             SaxBuffer inlineCleanupOutput = (SaxBuffer)contentHandler;
146             this.contentHandler = new SaxBuffer();
147             this.openElements = new ArrayList();
148
149             // do content block cleanup a second time, since cleanup of empty inline elements might have left empty content blocks
150
cleanupBrsAndEmptyContentBlocks(inlineCleanupOutput.getBits());
151
152             SaxBuffer secondContentBlockCleanupOutput = (SaxBuffer)contentHandler;
153             this.contentHandler = finalContentHandler;
154             this.openElements = new ArrayList();
155
156             translateBeeaarsInPees(secondContentBlockCleanupOutput.getBits());
157         }
158
159         /**
160          * <ul>
161          * <li>Makes sure all content is contained inside html/body
162          * <li>Drops unallowed elements
163          * <li>Does element translations (ie b into strong)
164          * <li>Only outputs non-namespaced elements
165          * </ul>
166          */

167         private void elementCleanup(List bits) throws SAXException JavaDoc {
            // For every StartElement read, exactly one entry is pushed on this stack; the
            // matching EndElement pops it and replays it via toSAX. A no-arg EndElementInfo
            // is pushed for dropped elements (presumably it emits nothing -- confirm in EndElementInfo).
168             Stack endElements = new Stack();
            // pre only gets its special handling when the template has a descriptor for it
169             boolean preSupported = template.descriptors.containsKey("pre");
170
171             int i = 0;
172             while (i < bits.size()) {
173                 Object JavaDoc bit = bits.get(i);
174                 if (bit instanceof SaxBuffer.StartElement) {
175                     SaxBuffer.StartElement startElement = (SaxBuffer.StartElement)bit;
176                     if (!startElement.namespaceURI.equals("")) {
177                         // namespaced elements are dropped
178
endElements.add(new EndElementInfo());
179                     } else {
180                         if (startElement.localName.equals("span")) {
181                             // two possibilities:
182
// * has only class attribute with recognized value
183
// * has style with certain recognized effects (bold/italic) -> translate to semantic correct tag.
184
String JavaDoc classAttr = startElement.attrs.getValue("class");
185                             if (classAttr != null) {
186                                 if (template.allowedSpanClasses.contains(classAttr)) {
187                                     // make new attributes element to make sure there are no other attributes on the element
188
AttributesImpl JavaDoc attrs = new AttributesImpl JavaDoc();
189                                     attrs.addAttribute("", "class", "class", "CDATA", classAttr);
190                                     startElement("span", attrs);
191                                     endElements.push(new EndElementInfo("span"));
192                                 } else {
193                                     // span element is dropped
194
endElements.push(new EndElementInfo());
195                                 }
196                             } else {
197                                 String JavaDoc styleAttr = startElement.attrs.getValue("style");
198                                 if (styleAttr != null) {
                                    // the style value is split on ';' into name:value declarations; only
                                    // font-weight:bold and font-style:italic are recognized
199                                     StringTokenizer styleAttrTokenizer = new StringTokenizer(styleAttr, ";");
200                                     boolean hasBold = false;
201                                     boolean hasItalic = false;
202                                     while (styleAttrTokenizer.hasMoreTokens()) {
203                                         String JavaDoc styleToken = styleAttrTokenizer.nextToken();
204                                         int colonPos = styleToken.indexOf(':');
205                                         if (colonPos != -1) {
206                                             String JavaDoc name = styleToken.substring(0, colonPos).trim().toLowerCase();
207                                             String JavaDoc value = styleToken.substring(colonPos + 1).trim().toLowerCase();
208                                             if (name.equals("font-weight") && value.equals("bold")) {
209                                                 hasBold = true;
210                                             } else if (name.equals("font-style") && value.equals("italic")) {
211                                                 hasItalic = true;
212                                             }
213                                         }
214                                     }
215
                                    // the span itself is not output; it is replaced by strong and/or em
                                    // (strong outermost), and one combined end entry closes whatever was opened
216                                     MultiEndElementInfo endElement = new MultiEndElementInfo();
217                                     if (hasBold) {
218                                         startElement("strong", new AttributesImpl JavaDoc());
219                                         endElement.add(new EndElementInfo("strong"));
220                                     }
221                                     if (hasItalic) {
222                                         startElement("em", new AttributesImpl JavaDoc());
223                                         endElement.add(new EndElementInfo("em"));
224                                     }
225                                     endElements.push(endElement);
226                                 } else {
                                    // span without class and without style: dropped
227                                     endElements.push(new EndElementInfo());
228                                 }
229                             }
230                         } else if (startElement.localName.equals("div")) {
                            // a div is only kept, with nothing but its class attribute, when the class value is allowed
231                             String JavaDoc classAttr = startElement.attrs.getValue("class");
232                             if (classAttr != null && template.allowedDivClasses.contains(classAttr)) {
233                                 AttributesImpl JavaDoc attrs = new AttributesImpl JavaDoc();
234                                 attrs.addAttribute("", "class", "class", "CDATA", classAttr);
235                                 startElement("div", attrs);
236                                 endElements.push(new EndElementInfo("div"));
237                             } else {
238                                 // unallowed class, drop div element
239
endElements.push(new EndElementInfo());
240                             }
241                         } else if (startElement.localName.equals("p")) {
                            // p is always output; its class attribute only survives when the value is in allowedParaClasses
242                             String JavaDoc classAttr = startElement.attrs.getValue("class");
243                             if (classAttr != null && template.allowedParaClasses.contains(classAttr)) {
244                                 startElement("p", getAllowedAttributes(startElement));
245                                 endElements.push(new EndElementInfo("p"));
246                             } else {
247                                 AttributesImpl JavaDoc attrs = getAllowedAttributes(startElement);
248                                 int classPos = attrs.getIndex("class");
249                                 if (classPos != -1)
250                                     attrs.removeAttribute(classPos);
251                                 startElement("p", attrs);
252                                 endElements.push(new EndElementInfo("p"));
253                             }
254                         } else if (startElement.localName.equals("pre") && preSupported) {
                            // same class filtering as for p, using allowedPreClasses
255                             String JavaDoc classAttr = startElement.attrs.getValue("class");
256                             if (classAttr != null && template.allowedPreClasses.contains(classAttr)) {
257                                 startElement("pre", getAllowedAttributes(startElement));
258                                 endElements.push(new EndElementInfo("pre"));
259                             } else {
260                                 AttributesImpl JavaDoc attrs = getAllowedAttributes(startElement);
261                                 int classPos = attrs.getIndex("class");
262                                 if (classPos != -1)
263                                     attrs.removeAttribute(classPos);
264                                 startElement("pre", attrs);
265                                 endElements.push(new EndElementInfo("pre"));
266                             }
267                         } else if (startElement.localName.equals("b")) {
268                             // translate to <strong>
269
startElement("strong", new AttributesImpl JavaDoc());
270                             endElements.push(new EndElementInfo("strong"));
271                         } else if (startElement.localName.equals("i")) {
272                             // translate to <em>
273
startElement("em", new AttributesImpl JavaDoc());
274                             endElements.push(new EndElementInfo("em"));
275                         } else if (startElement.localName.equals("strike")) {
276                             // translate to <del>
277
startElement("del", new AttributesImpl JavaDoc());
278                             endElements.push(new EndElementInfo("del"));
279                         } else if (startElement.localName.equals("html")) {
280                             if (openElements.size() != 0)
281                                 throw new SAXException JavaDoc("html element can only appear as root element.");
282
                            // html is output without any attributes
283                             startElement(startElement.localName, new AttributesImpl JavaDoc());
284                             endElements.push(new EndElementInfo(startElement.localName));
285
286                             // fast forward to body element
                            // every event between <html> and the non-namespaced <body> start tag is skipped
                            // without being output or tracked on endElements
287
while (true) {
288                                 i++;
289                                 if (i >= bits.size())
290                                     throw new SAXException JavaDoc("Reached end of input without encountering opening body tag.");
291
292                                 Object JavaDoc nextBit = bits.get(i);
293                                 if (nextBit instanceof SaxBuffer.StartElement && ((SaxBuffer.StartElement)nextBit).localName.equals("body")
294                                         && ((SaxBuffer.StartElement)nextBit).namespaceURI.equals("")) {
                                    // step back one position: the i++ at the bottom of the outer loop
                                    // then lands on the body start tag, so it gets processed normally
295                                     i--;
296                                     break;
297                                 }
298                             }
299
300                         } else if (startElement.localName.equals("body")) {
                            // body must be the direct child of html: exactly one open element, named html
301                             if (openElements.size() != 1)
302                                 throw new SAXException JavaDoc("body element can only appear as child of html element");
303
304                             if (!((StartElementInfo)openElements.get(0)).getName().equals("html"))
305                                 throw new SAXException JavaDoc("body element can only appear as child of html element");
306
307                             startElement("body", new AttributesImpl JavaDoc());
308                             endElements.push(new EndElementInfo("body"));
309                         } else if (startElement.localName.equals("img") && template.descriptors.containsKey("img")) {
                            // when an alternate source attribute is configured and present (non-empty) on the
                            // original element, its value replaces (or supplies) the src attribute
310                             AttributesImpl JavaDoc attrs = getAllowedAttributes(startElement);
311                             if (template.imgAlternateSrcAttr != null) {
312                                 String JavaDoc altSrc = startElement.attrs.getValue(template.imgAlternateSrcAttr);
313                                 if (altSrc != null && !altSrc.equals("")) {
314                                     int hrefIndex = attrs.getIndex("src");
315                                     if (hrefIndex != -1)
316                                         attrs.setValue(hrefIndex, altSrc);
317                                     else
318                                         attrs.addAttribute("", "src", "src", "CDATA", altSrc);
319                                 }
320                             }
321                             startElement(startElement.localName, attrs);
322                             endElements.push(new EndElementInfo(startElement.localName));
323                         } else if (startElement.localName.equals("a") && template.descriptors.containsKey("a")) {
                            // same mechanism as for img, applied to href via linkAlternateHrefAttr
324                             AttributesImpl JavaDoc attrs = getAllowedAttributes(startElement);
325                             if (template.linkAlternateHrefAttr != null) {
326                                 String JavaDoc altHref = startElement.attrs.getValue(template.linkAlternateHrefAttr);
327                                 if (altHref != null && !altHref.equals("")) {
328                                     int hrefIndex = attrs.getIndex("href");
329                                     if (hrefIndex != -1)
330                                         attrs.setValue(hrefIndex, altHref);
331                                     else
332                                         attrs.addAttribute("", "href", "href", "CDATA", altHref);
333                                 }
334                             }
335                             startElement(startElement.localName, attrs);
336                             endElements.push(new EndElementInfo(startElement.localName));
337                         } else if (startElement.localName.equals("td") || startElement.localName.equals("th")) {
338                             AttributesImpl JavaDoc attrs = getAllowedAttributes(startElement);
339
340                             // remove dummy rowspan and colspan attributes
341
String JavaDoc rowspan = attrs.getValue("rowspan");
342                             if (rowspan != null && rowspan.equals("1")) {
343                                 attrs.removeAttribute(attrs.getIndex("rowspan"));
344                             }
345                             String JavaDoc colspan = attrs.getValue("colspan");
346                             if (colspan != null && colspan.equals("1")) {
347                                 attrs.removeAttribute(attrs.getIndex("colspan"));
348                             }
349
350                             startElement(startElement.localName, attrs);
351                             endElements.push(new EndElementInfo(startElement.localName));
352                         } else if (template.descriptors.containsKey(startElement.localName)) {
                            // any other element known to the template: kept, with its attributes filtered
353                             startElement(startElement.localName, getAllowedAttributes(startElement));
354                             endElements.push(new EndElementInfo(startElement.localName));
355                         } else {
356                             // skip element
357
endElements.push(new EndElementInfo());
358                         }
359                     }
360                 } else if (bit instanceof SaxBuffer.EndElement) {
                    // replay whatever was recorded when the matching start element was handled
361                     XMLizable endElement = (XMLizable)endElements.pop();
362                     endElement.toSAX(contentHandler);
363                 } else if (bit instanceof SaxBuffer.Characters) {
                    // character data is always passed through, also inside dropped elements
364                     ((SaxBuffer.Characters)bit).send(contentHandler);
365                 } else if (bit instanceof SaxBuffer.StartDocument) {
366                     contentHandler.startDocument();
367                 } else if (bit instanceof SaxBuffer.EndDocument) {
368                     contentHandler.endDocument();
369                     // don't do any events after endDocument
370
return;
371                 }
372                 i++;
373             }
374         }
375
376         private AttributesImpl JavaDoc getAllowedAttributes(SaxBuffer.StartElement startElement) {
377             // limit attributes to the allowed attributes
378
String JavaDoc[] allowedAttributes = ((ElementDescriptor)template.descriptors.get(startElement.localName)).getAttributeNames();
379             AttributesImpl JavaDoc attrs = new AttributesImpl JavaDoc();
380             for (int k = 0; k < allowedAttributes.length; k++) {
381                 String JavaDoc value = startElement.attrs.getValue(allowedAttributes[k]);
382                 if (value != null) {
383                     attrs.addAttribute("", allowedAttributes[k], allowedAttributes[k], "CDATA", value);
384                 }
385             }
386             return attrs;
387         }
388
389         /**
390          * Puts p tags around all characters or elements that are child of html/body
391          * but are not allowed there.
392          */

393         private void introduceParas(List bits) throws SAXException JavaDoc {
394             Stack endElements = new Stack();
395             IntStack introducedParas = new IntStack();
396             ElementDescriptor bodyDescriptor = (ElementDescriptor)template.descriptors.get("body");
397             ElementDescriptor tdDescriptor = (ElementDescriptor)template.descriptors.get("td");
398             ElementDescriptor thDescriptor = (ElementDescriptor)template.descriptors.get("th");
399             ElementDescriptor paraDescriptor = (ElementDescriptor)template.descriptors.get("p");
400
401             int i = -1;
402             while (i < bits.size()) {
403                 i++;
404                 Object JavaDoc bit = bits.get(i);
405                 if (bit instanceof SaxBuffer.StartElement) {
406                     SaxBuffer.StartElement startElement = (SaxBuffer.StartElement)bit;
407
408                     if (!introducedParas.empty() && introducedParas.peek() == 0 && !paraDescriptor.childAllowed(startElement.localName)) {
409                         endElement("p");
410                         introducedParas.pop();
411                     } else if (openElements.size() > 1) {
412                         StartElementInfo parentInfo = (StartElementInfo)openElements.get(openElements.size() - 1);
413                         String JavaDoc parentName = parentInfo.getName();
414                         boolean startPara =
415                                 (parentName.equals("body") && !bodyDescriptor.childAllowed(startElement.localName)
416                                 && paraDescriptor.childAllowed(startElement.localName))
417                                 ||
418                                 (parentName.equals("td") && !tdDescriptor.childAllowed(startElement.localName)
419                                 && paraDescriptor.childAllowed(startElement.localName))
420                                 ||
421                                 (parentName.equals("th") && !thDescriptor.childAllowed(startElement.localName)
422                                 && paraDescriptor.childAllowed(startElement.localName));
423
424                         if (startPara) {
425                             startElement("p", new AttributesImpl JavaDoc());
426                             introducedParas.push(0);
427                         }
428                     }
429
430                     if (!introducedParas.empty()) {
431                         introducedParas.push(introducedParas.pop() + 1);
432                     }
433
434                     startElement(startElement.localName, startElement.attrs);
435                     endElements.push(new EndElementInfo(startElement.localName));
436
437
438                 } else if (bit instanceof SaxBuffer.EndElement) {
439                     if (!introducedParas.empty() && introducedParas.peek() == 0) {
440                         endElement("p");
441                         introducedParas.pop();
442                     }
443
444                     XMLizable endElement = (XMLizable)endElements.pop();
445                     endElement.toSAX(contentHandler);
446
447                     if (!introducedParas.empty()) {
448                         introducedParas.push(introducedParas.pop() - 1);
449                     }
450                 } else if (bit instanceof SaxBuffer.Characters) {
451                     if (openElements.size() > 1) {
452                         StartElementInfo parentInfo = (StartElementInfo)openElements.get(openElements.size() - 1);
453                         String JavaDoc parentName = parentInfo.getName();
454                         boolean startPara = parentName.equals("body") || parentName.equals("td") || parentName.equals("th") || parentName.equals("blockquote");
455                         if (startPara) {
456                             startElement("p", new AttributesImpl JavaDoc());
457                             introducedParas.push(0);
458                         }
459                     }
460                     ((SaxBuffer.Characters)bit).send(contentHandler);
461                 } else if (bit instanceof SaxBuffer.StartDocument) {
462                     contentHandler.startDocument();
463                 } else if (bit instanceof SaxBuffer.EndDocument) {
464                     contentHandler.endDocument();
465                     // don't do any events after endDocument
466
return;
467                 }
468             }
469         }
470
471         /**
472          * Performs structural corrections, so that the end result is
473          * limited to what XHTML1 allows (or at least close to it).
474          */

475         private void structuralCleanup(List bits) throws SAXException JavaDoc {
476             Stack endElements = new Stack();
477
478             int i = -1;
479             while (i < bits.size()) {
480                 i++;
481                 Object JavaDoc bit = bits.get(i);
482                 if (bit instanceof SaxBuffer.StartElement) {
483                     SaxBuffer.StartElement startElement = (SaxBuffer.StartElement)bit;
484
485                     ElementDescriptor descriptor = (ElementDescriptor)template.descriptors.get(startElement.localName);
486                     if (descriptor == null)
487                         throw new SAXException JavaDoc("Missing ElementDescriptor for tagname " + startElement.localName);
488
489                     // check if this element can occur inside its parent
490
if (openElements.size() > 0) {
491                         String JavaDoc parentElementName = ((StartElementInfo)openElements.get(openElements.size() - 1)).getName();
492                         ElementDescriptor parentDescriptor = (ElementDescriptor)template.descriptors.get(parentElementName);
493
494                         boolean allowed = parentDescriptor.childAllowed(startElement.localName);
495
496                         // if it's allowed, let's get it done and over with
497
if (allowed) {
498                             startElement(startElement.localName, startElement.attrs);
499                             endElements.push(new EndElementInfo(startElement.localName));
500                             continue;
501                         }
502
503                         // not allowed -> search for first parent where it is allowed
504
int firstGoodAncestor = -1;
505                         for (int k = openElements.size() - 2; k >= 0; k--) {
506                             String JavaDoc ancestorElementName = ((StartElementInfo)openElements.get(k)).getName();
507                             ElementDescriptor ancestorDescriptor = (ElementDescriptor)template.descriptors.get(ancestorElementName);
508                             if (ancestorDescriptor.childAllowed(startElement.localName)) {
509                                 firstGoodAncestor = k;
510                                 break;
511                             }
512                         }
513
514                         if (firstGoodAncestor == -1)
515                             throw new SAXException JavaDoc("Element \"" + startElement.localName + "\" is disallowed at its current location, and could not automatically fix this.");
516
517                         // close open elements to get to the allowed ancestor
518
MultiEndElementInfo endElementInfo = new MultiEndElementInfo();
519                         for (int k = openElements.size() - 1; k > firstGoodAncestor; k--) {
520                             endElementInfo.add((StartElementInfo)openElements.get(k));
521                         }
522                         endElementInfo.add(new EndElementInfo(startElement.localName));
523
524                         for (int k = openElements.size() - 1; k > firstGoodAncestor; k--) {
525                             endElement(((StartElementInfo)openElements.get(k)).getName());
526                         }
527
528                         startElement(startElement.localName, startElement.attrs);
529                         endElements.push(endElementInfo);
530
531                     } else {
532                         startElement(startElement.localName, startElement.attrs);
533                         endElements.push(new EndElementInfo(startElement.localName));
534                     }
535
536                 } else if (bit instanceof SaxBuffer.EndElement) {
537                     XMLizable endElement = (XMLizable)endElements.pop();
538                     endElement.toSAX(contentHandler);
539                 } else if (bit instanceof SaxBuffer.Characters) {
540                     ((SaxBuffer.Characters)bit).send(contentHandler);
541                 } else if (bit instanceof SaxBuffer.StartDocument) {
542                     contentHandler.startDocument();
543                 } else if (bit instanceof SaxBuffer.EndDocument) {
544                     contentHandler.endDocument();
545                     // don't do any events after endDocument
546
return;
547                 }
548             }
549         }
550
551         /**
552          * Removes p's, headers containing only whitespace or br's, changes sequences
553          * of more than two br's into a new paragraph, drops br's at start or
554          * end of p, headers.
555          */

556         private void cleanupBrsAndEmptyContentBlocks(List bits) throws SAXException JavaDoc {
557             Stack endElements = new Stack();
558
559             int i = -1;
560             while (i < bits.size()) {
561                 i++;
562                 Object JavaDoc bit = bits.get(i);
563                 if (bit instanceof SaxBuffer.StartElement) {
564                     SaxBuffer.StartElement startElement = (SaxBuffer.StartElement)bit;
565
566                     boolean contentBlockElement = contentBlockElements.contains(startElement.localName);
567                     if (contentBlockElement || startElement.localName.equals("td") || startElement.localName.equals("th")) {
568                         // starting a new p, td, ...: search if this element contains anything non-whitespace non-br
569
int elementNesting = 0;
570                         int z = i;
571                         boolean reachedEndElement = false;
572                         while (true) {
573                             z++;
574                             Object JavaDoc bit2 = bits.get(z);
575                             if (bit2 instanceof SaxBuffer.Characters && isWhitespace((SaxBuffer.Characters)bit2)) {
576                                 // continue loop
577
} else if (bit2 instanceof SaxBuffer.StartElement
578                                     && ((SaxBuffer.StartElement)bit2).localName.equals("br")) {
579                                 elementNesting++;
580                             } else if (bit2 instanceof SaxBuffer.EndElement
581                                     && ((SaxBuffer.EndElement)bit2).localName.equals("br")) {
582                                 elementNesting--;
583                             } else if (bit2 instanceof SaxBuffer.EndElement && elementNesting == 0) {
584                                 reachedEndElement = true;
585                                 break;
586                             } else {
587                                 break;
588                             }
589                         }
590
591                         if (reachedEndElement) {
592                             if (contentBlockElement) {
593                                 // skip over this element
594
i = z;
595                                 continue;
596                             } else {
597                                 startElement(startElement.localName, startElement.attrs);
598                                 endElements.push(new EndElementInfo(startElement.localName));
599                                 // skip content of this element
600
i = z - 1;
601                                 continue;
602                             }
603                         } else {
604                             if (contentBlockElement) {
605                                 // skip over initial br's or whitespace at start of content block
606
i = z - 1;
607                             } else {
608                                 // nothing to do
609
}
610                         }
611
612                     } else if (startElement.localName.equals("br")) {
613                         // search for a parent content block element
614
int firstContentBlockAncestor = -1;
615                         for (int k = openElements.size() - 1; k >= 0; k--) {
616                             StartElementInfo startElementInfo = (StartElementInfo)openElements.get(k);
617                             if (contentBlockElements.contains(startElementInfo.getName())) {
618                                 firstContentBlockAncestor = k;
619                                 break;
620                             }
621                         }
622
623                         // if we are inside a content block ...
624
if (firstContentBlockAncestor != -1) {
625                             // count number of br's following this
626
int z = i;
627                             int brCount = 1;
628                             boolean continueSearch = true;
629                             while (continueSearch) {
630                                 z++;
631                                 Object JavaDoc bit2 = bits.get(z);
632                                 if (bit2 instanceof SaxBuffer.EndElement) {
633                                     String JavaDoc name = ((SaxBuffer.EndElement)bit2).localName;
634                                     if (!name.equals("br")) {
635                                         continueSearch = false;
636                                     }
637                                 } else if (bit2 instanceof SaxBuffer.StartElement
638                                     && ((SaxBuffer.StartElement)bit2).localName.equals("br")) {
639                                     brCount++;
640                                     continueSearch = true;
641                                 } else if (bit2 instanceof SaxBuffer.Characters && isWhitespace((SaxBuffer.Characters)bit2)) {
642                                     continueSearch = true;
643                                 } else {
644                                     continueSearch = false;
645                                 }
646                             }
647
648                             // if all the next bits till the first closing content block tag are either end elements or whitespace,
649
// then drop the br's.
650
boolean beforeEndContentBlock = false;
651                             for (int t = z; t < bits.size(); t++) {
652                                 if (bits.get(t) instanceof SaxBuffer.EndElement) {
653                                     SaxBuffer.EndElement endEl = (SaxBuffer.EndElement)bits.get(t);
654                                     if (contentBlockElements.contains(endEl.localName)) {
655                                         beforeEndContentBlock = true;
656                                         break;
657                                     }
658                                     // other end element events: continue searching
659
} else if (bits.get(t) instanceof SaxBuffer.Characters && isWhitespace((SaxBuffer.Characters)bits.get(t))) {
660                                     // whitespace: continue searching
661
} else {
662                                     // everything else: stop
663
break;
664                                 }
665                             }
666                             if (beforeEndContentBlock) {
667                                 i = z - 1;
668                                 continue;
669                             }
670
671                             if (brCount >= 2) {
672                                 // drop the br's, close content block element, open content block element
673
i = z - 1; // z is positioned on the first non-br, non-whitespace element following the br's
674

675                                 List elementsToRestart = new ArrayList();
676                                 for (int k = openElements.size() - 1; k >= firstContentBlockAncestor; k--) {
677                                     elementsToRestart.add(openElements.get(k));
678                                 }
679
680                                 for (int k = openElements.size() - 1; k >= firstContentBlockAncestor; k--) {
681                                     endElement(((StartElementInfo)openElements.get(k)).getName());
682                                 }
683
684                                 for (int k = elementsToRestart.size() - 1; k >= 0; k--) {
685                                     StartElementInfo startElementInfo = (StartElementInfo)elementsToRestart.get(k);
686                                     startElement(startElementInfo.getName(), startElementInfo.getAttrs());
687                                 }
688                                 continue;
689                             }
690                         } else if (startElement.localName.equals("br") && openElements.size() > 1
691                             && needsCleanupOfEndBrs.contains(((StartElementInfo)openElements.get(openElements.size() - 1)).getName())) {
692                             // this is useful to remove <br>s inside <td>s or <br>s at the end of <li>s like mozilla does
693
String JavaDoc elementName = ((StartElementInfo)openElements.get(openElements.size() - 1)).getName();
694
695                             boolean nextIsEndOfElement = false;
696                             int r = i + 1;
697                             for (; r < bits.size(); r++) {
698                                 Object JavaDoc nextBit = bits.get(r);
699                                 if (nextBit instanceof SaxBuffer.EndElement) {
700                                     SaxBuffer.EndElement endEl = (SaxBuffer.EndElement)nextBit;
701                                     if (endEl.localName.equals("br")) {
702                                         continue;
703                                     } else if (endEl.localName.equals(elementName)) {
704                                         nextIsEndOfElement = true;
705                                         break;
706                                     } else {
707                                         break;
708                                     }
709                                 } else if (nextBit instanceof SaxBuffer.Characters && isWhitespace((SaxBuffer.Characters)nextBit)) {
710                                     // do nothing
711
} else {
712                                     break;
713                                 }
714                             }
715
716                             if (nextIsEndOfElement) {
717                                 i = r - 1;
718                                 continue;
719                             }
720                         }
721                     }
722
723                     startElement(startElement.localName, startElement.attrs);
724                     endElements.push(new EndElementInfo(startElement.localName));
725
726
727                 } else if (bit instanceof SaxBuffer.EndElement) {
728                     XMLizable endElement = (XMLizable)endElements.pop();
729                     endElement.toSAX(contentHandler);
730                 } else if (bit instanceof SaxBuffer.Characters) {
731                     ((SaxBuffer.Characters)bit).send(contentHandler);
732                 } else if (bit instanceof SaxBuffer.StartDocument) {
733                     contentHandler.startDocument();
734                 } else if (bit instanceof SaxBuffer.EndDocument) {
735                     contentHandler.endDocument();
736                     // don't do any events after endDocument
737
return;
738                 }
739             }
740         }
741
742         private void cleanupWipeableEmptyElements(List bits) throws SAXException JavaDoc {
743             Stack endElements = new Stack();
744
745             int i = -1;
746             while (i < bits.size()) {
747                 i++;
748                 Object JavaDoc bit = bits.get(i);
749                 if (bit instanceof SaxBuffer.StartElement) {
750                     SaxBuffer.StartElement startElement = (SaxBuffer.StartElement)bit;
751                     if (wipeableEmptyElements.contains(startElement.localName)) {
752                         boolean hasWhitespace = false;
753                         boolean reachedEndElement = false;
754                         int elementNesting = 0;
755                         int k = i;
756                         while (true) {
757                             k++;
758                             Object JavaDoc nextBit = bits.get(k);
759                             if (nextBit instanceof SaxBuffer.StartElement && wipeableEmptyElements.contains(((SaxBuffer.StartElement)nextBit).localName)) {
760                                 elementNesting++;
761                             } else if (nextBit instanceof SaxBuffer.Characters && isWhitespace((SaxBuffer.Characters)nextBit)) {
762                                 hasWhitespace = true;
763                             } else if (nextBit instanceof SaxBuffer.EndElement && elementNesting > 0) {
764                                 elementNesting--;
765                             } else if (nextBit instanceof SaxBuffer.EndElement && elementNesting == 0) {
766                                 reachedEndElement = true;
767                                 break;
768                             } else {
769                                 break;
770                             }
771                         }
772
773                         if (reachedEndElement) {
774                             // skip the elements
775
i = k;
776                             // if the wipeable elements contained whitespace, generate a whitespace character
777
if (hasWhitespace)
778                                 contentHandler.characters(new char[] { ' ' }, 0, 1);
779                             continue;
780                         }
781                     }
782
783                     startElement(startElement.localName, startElement.attrs);
784                     endElements.push(new EndElementInfo(startElement.localName));
785
786                 } else if (bit instanceof SaxBuffer.EndElement) {
787                     XMLizable endElement = (XMLizable)endElements.pop();
788                     endElement.toSAX(contentHandler);
789                 } else if (bit instanceof SaxBuffer.Characters) {
790                     ((SaxBuffer.Characters)bit).send(contentHandler);
791                 } else if (bit instanceof SaxBuffer.StartDocument) {
792                     contentHandler.startDocument();
793                 } else if (bit instanceof SaxBuffer.EndDocument) {
794                     contentHandler.endDocument();
795                     // don't do any events after endDocument
796
return;
797                 }
798             }
799         }
800
801         /**
802          * Changes br elements inside pre elements into newline character events.
803          */

804         private void translateBeeaarsInPees(List bits) throws SAXException JavaDoc {
805             int preLevel = 0;
806             int i = -1;
807             while (i < bits.size()) {
808                 i++;
809                 Object JavaDoc bit = bits.get(i);
810                 if (bit instanceof SaxBuffer.StartElement) {
811                     SaxBuffer.StartElement startElement = (SaxBuffer.StartElement)bit;
812                     if (startElement.localName.equals("pre")) {
813                         preLevel++;
814                     } else if (preLevel > 0 && startElement.localName.equals("br")) {
815                         // normally an opening br should be immediatelly followed by the closing br,
816
// so let us restrict us to that case
817
Object JavaDoc nextBit = bits.get(i + 1);
818                         if (nextBit instanceof SaxBuffer.EndElement && ((SaxBuffer.EndElement)nextBit).localName.equals("br")) {
819                             // replace this br by a newline
820
contentHandler.characters(NEWLINE, 0, 1);
821                             i++;
822                             continue;
823                         }
824                     }
825
826                     startElement(startElement.localName, startElement.attrs);
827
828                 } else if (bit instanceof SaxBuffer.EndElement) {
829                     SaxBuffer.EndElement endElement = (SaxBuffer.EndElement)bit;
830                     if (endElement.localName.equals("pre")) {
831                         preLevel--;
832                     }
833                     contentHandler.endElement(endElement.namespaceURI, endElement.localName, endElement.qName);
834                 } else if (bit instanceof SaxBuffer.Characters) {
835                     ((SaxBuffer.Characters)bit).send(contentHandler);
836                 } else if (bit instanceof SaxBuffer.StartDocument) {
837                     contentHandler.startDocument();
838                 } else if (bit instanceof SaxBuffer.EndDocument) {
839                     contentHandler.endDocument();
840                     // don't do any events after endDocument
841
return;
842                 }
843             }
844         }
845
846         private boolean isWhitespace(SaxBuffer.Characters characters) {
847             for (int i = 0; i < characters.ch.length; i++) {
848                 if (!(Character.isWhitespace(characters.ch[i]) || characters.ch[i] == (char)160)) // 160 is &nbsp;
849
return false;
850             }
851             return true;
852         }
853
854         private void startElement(String JavaDoc name, Attributes JavaDoc attrs) throws SAXException JavaDoc {
855             contentHandler.startElement("", name, name, attrs);
856             openElements.add(new StartElementInfo(name, attrs));
857         }
858
859         private void endElement(String JavaDoc name) throws SAXException JavaDoc {
860             contentHandler.endElement("", name, name);
861             String JavaDoc removed = ((StartElementInfo)openElements.remove(openElements.size() - 1)).getName();
862             if (!removed.equals(name)) {
863                 throw new SAXException JavaDoc("The close tag \"" + name + "\" did not match the open tag \"" + removed + "\".");
864             }
865         }
866
867         private class StartElementInfo implements XMLizable {
868             private final String JavaDoc name;
869             private final Attributes JavaDoc attrs;
870
871             public StartElementInfo(String JavaDoc name, Attributes JavaDoc attrs) {
872                 this.name = name;
873                 this.attrs = attrs;
874             }
875
876             public String JavaDoc getName() {
877                 return name;
878             }
879
880             public Attributes JavaDoc getAttrs() {
881                 return attrs;
882             }
883
884             public void toSAX(ContentHandler JavaDoc contentHandler) throws SAXException JavaDoc {
885                 startElement(name, attrs);
886             }
887         }
888
889         private class EndElementInfo implements XMLizable {
890             private final boolean skip;
891             private final String JavaDoc localName;
892
893             public EndElementInfo() {
894                 this.skip = true;
895                 this.localName = null;
896             }
897
898             public EndElementInfo(String JavaDoc localName) {
899                 this.skip = false;
900                 this.localName = localName;
901             }
902
903             public void toSAX(ContentHandler JavaDoc contentHandler) throws SAXException JavaDoc {
904                 if (!skip) {
905                     endElement(localName);
906                 }
907             }
908         }
909
910         private final class MultiEndElementInfo implements XMLizable {
911             private ArrayList tags = new ArrayList(2);
912
913             public void add(EndElementInfo endElement) {
914                 this.tags.add(endElement);
915             }
916
917             public void add(StartElementInfo endElement) {
918                 this.tags.add(endElement);
919             }
920
921             public void toSAX(ContentHandler JavaDoc contentHandler) throws SAXException JavaDoc {
922                 for (int i = tags.size() - 1; i >= 0; i--) {
923                     XMLizable tag = (XMLizable)tags.get(i);
924                     tag.toSAX(contentHandler);
925                 }
926             }
927         }
928
929     }
930
931     interface XMLizable {
932         public void toSAX(ContentHandler JavaDoc contentHandler) throws SAXException JavaDoc;
933     }
934
935     static class IntStack {
936         private Stack stack = new Stack();
937
938         public void push(int value) {
939             stack.push(new Integer JavaDoc(value));
940         }
941
942         public int pop() {
943             return ((Integer JavaDoc)stack.pop()).intValue();
944         }
945
946         public boolean empty() {
947             return stack.empty();
948         }
949
950         public int peek() {
951             return ((Integer JavaDoc)stack.peek()).intValue();
952         }
953     }
954 }
955
Popular Tags