KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > outerj > daisy > htmlcleaner > StylingHtmlSerializer


/*
 * Copyright 2004 Outerthought bvba and Schaubroeck nv
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

16 package org.outerj.daisy.htmlcleaner;
17
import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.*;

import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.Locator;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;
28
29 /**
30  * A special serializer for outputting XML-well-formed HTML.
31  *
32  * <p>This serializer is not meant as a general purpose serializer. Rather
33  * it is used as part of a HTML cleaning pipeline whose goal is to normalize
34  * html before storing it in a CMS.<p>
35  *
36  * <p>This serializer isn't designed or tested for performance, but that doesn't matter since it's
37  * only used for update operations and for relatively small content blurbs.</p>
38  *
39  * <p>The serializer will limit the output width to a certain number of characters.
40  * It can be configured to output a variable number of characters around the start
41  * and end tags of certain elements. Sequences of multiple whitespace characters
42  * are collapsed.</p>
43  *
44  * <p>The input must contain the html root tag.</p>
45  *
46  * <p>The output encoding is always UTF-8. Note that this can't simply be changed
47  * because the serializer currently doesn't check whether characters can be
48  * outputted in the given encoding (UTF-8 supports all of unicode, so such
49  * checks are not required). The serializer does not check for characters
50  * that are illegal in XML.</p>
51  */

52 class StylingHtmlSerializer implements ContentHandler JavaDoc {
53     private Writer JavaDoc writer;
54     private LineRenderer line;
55     private StartElementInfo currentStartElement;
56     private boolean inPreElement = false;
57     private HtmlCleanerTemplate template;
58     private OutputElementDescriptor dummy = new OutputElementDescriptor(0, 0, 0, 0, true);
59
60     public StylingHtmlSerializer(HtmlCleanerTemplate template) {
61         this.template = template;
62     }
63
64     public void setOutputStream(OutputStream JavaDoc outputStream) throws IOException JavaDoc {
65         // Note: this serializer assumes hardcoded UTF-8. Otherwise it would need extra
66
// functionality like character escaping etc.
67
this.writer = new OutputStreamWriter JavaDoc(outputStream, "UTF-8");
68         line = new LineRenderer();
69         currentStartElement = null;
70     }
71
72     public void startDocument() throws SAXException JavaDoc {
73     }
74
75     public void endDocument() throws SAXException JavaDoc {
76         try {
77             line.flushLine(false);
78             writer.flush();
79         } catch (IOException JavaDoc e) {
80             throw new SAXException JavaDoc(e);
81         }
82     }
83
84     public void characters(char ch[], int start, int length) throws SAXException JavaDoc {
85         writePendingStartElement(false);
86         line.writeText(escapeReservedCharacters(new String JavaDoc(ch, start, length)));
87     }
88
89     public void startElement(String JavaDoc namespaceURI, String JavaDoc localName, String JavaDoc qName, Attributes JavaDoc atts) throws SAXException JavaDoc {
90         writePendingStartElement(false);
91         currentStartElement = new StartElementInfo(localName, atts);
92         if (localName.equals("pre")) {
93             line.flushLine(false);
94             OutputElementDescriptor descriptor = getElementDescriptor("pre");
95             line.newLines(descriptor.getNewLinesBeforeOpenTag());
96             inPreElement = true;
97         }
98     }
99
100     public void writePendingStartElement(boolean empty) throws SAXException JavaDoc {
101         if (currentStartElement != null) {
102             String JavaDoc localName = currentStartElement.getLocalName();
103             Attributes JavaDoc atts = currentStartElement.getAttrs();
104
105             StringBuffer JavaDoc tag = new StringBuffer JavaDoc(localName.length() + 2 + (atts.getLength() * 50));
106             tag.append('<').append(localName);
107
108             if (atts.getLength() > 0) {
109                 for (int i = 0; i < atts.getLength(); i++) {
110                     tag.append(' ');
111                     tag.append(atts.getLocalName(i));
112                     tag.append("=\"");
113                     tag.append(escapeAttribute(atts.getValue(i)));
114                     tag.append('"');
115                 }
116             }
117
118             if (empty) {
119                 tag.append("/>");
120             } else {
121                 tag.append('>');
122             }
123
124             OutputElementDescriptor descriptor = getElementDescriptor(localName);
125             if (!inPreElement)
126                 line.newLines(descriptor.getNewLinesBeforeOpenTag());
127             line.writeStartTag(tag.toString(), descriptor);
128             if (!inPreElement) {
129                 if (empty)
130                     line.newLines(descriptor.getNewLinesAfterCloseTag());
131                 else
132                     line.newLines(descriptor.getNewLinesAfterOpenTag());
133             }
134
135             if (localName.equals("pre"))
136                 line.flushLine(false);
137
138             currentStartElement = null;
139         }
140     }
141
142     public void endElement(String JavaDoc namespaceURI, String JavaDoc localName, String JavaDoc qName) throws SAXException JavaDoc {
143         if (localName.equals("pre"))
144             inPreElement = false;
145
146         if (currentStartElement != null) {
147             writePendingStartElement(true);
148         } else {
149             String JavaDoc tag = "</" + localName + ">";
150             OutputElementDescriptor descriptor = getElementDescriptor(localName);
151             if (!inPreElement)
152                 line.newLines(descriptor.getNewLinesBeforeCloseTag());
153             line.writeEndTag(tag, descriptor);
154             if (!inPreElement)
155                 line.newLines(descriptor.getNewLinesAfterCloseTag());
156         }
157     }
158
159     public void ignorableWhitespace(char ch[], int start, int length) throws SAXException JavaDoc {
160     }
161
162     public void endPrefixMapping(String JavaDoc prefix) throws SAXException JavaDoc {
163     }
164
165     public void skippedEntity(String JavaDoc name) throws SAXException JavaDoc {
166     }
167
168     public void setDocumentLocator(Locator JavaDoc locator) {
169     }
170
171     public void processingInstruction(String JavaDoc target, String JavaDoc data) throws SAXException JavaDoc {
172     }
173
174     public void startPrefixMapping(String JavaDoc prefix, String JavaDoc uri) throws SAXException JavaDoc {
175     }
176
177     public void endCDATA() throws SAXException JavaDoc {
178     }
179
180     public void endDTD() throws SAXException JavaDoc {
181     }
182
183     public void startCDATA() throws SAXException JavaDoc {
184     }
185
186     public void comment(char ch[], int start, int length) throws SAXException JavaDoc {
187     }
188
189     public void endEntity(String JavaDoc name) throws SAXException JavaDoc {
190     }
191
192     public void startEntity(String JavaDoc name) throws SAXException JavaDoc {
193     }
194
195     public void startDTD(String JavaDoc name, String JavaDoc publicId, String JavaDoc systemId) throws SAXException JavaDoc {
196     }
197
198     private OutputElementDescriptor getElementDescriptor(String JavaDoc localName) {
199         OutputElementDescriptor descriptor = (OutputElementDescriptor)template.outputElementDescriptors.get(localName);
200         if (descriptor != null)
201             return descriptor;
202         return dummy;
203     }
204
205     /**
206      * Escapes an attribute value assuming it is quoted in double quotes.
207      */

208     private String JavaDoc escapeAttribute(String JavaDoc value) {
209         StringBuffer JavaDoc newValue = new StringBuffer JavaDoc(value.length() + 10);
210         for (int i = 0; i < value.length(); i++) {
211             char c = value.charAt(i);
212             switch (c) {
213                 case '"':
214                     newValue.append("&quot;");
215                     break;
216                 case '<': // strictly spoken only needed after ]]
217
newValue.append("&lt;");
218                     break;
219                 case '>':
220                     newValue.append("&gt;");
221                     break;
222                 case '&':
223                     newValue.append("&amp;");
224                     break;
225                 default:
226                     newValue.append(c);
227             }
228         }
229         return newValue.toString();
230     }
231
232     private String JavaDoc escapeReservedCharacters(String JavaDoc text) {
233         StringBuffer JavaDoc newText = new StringBuffer JavaDoc(text.length() + 10);
234         for (int i = 0; i < text.length(); i++) {
235             char c = text.charAt(i);
236             switch (c) {
237                 case '<':
238                     newText.append("&lt;");
239                     break;
240                 case '>': // strictly spoken only needed after ]]
241
newText.append("&gt;");
242                     break;
243                 case '&':
244                     newText.append("&amp;");
245                     break;
246                 default:
247                     // if the character is not in the range allowed by XML, simply skip it
248
// (Note: neko html doesn't removed these characters when parsing, so we do it here)
249
if (c == 0x9 || c == 0xA || c == 0xD || (c >= 0x20 && c <= 0xD7FF) || (c >= 0xE000 && c <= 0xFFFD)
250                             || (c >= 0x10000 && c <= 0x10FFFF))
251                         newText.append(c);
252             }
253         }
254         return newText.toString();
255     }
256
257     /**
258      * Class that keeps track of the current line being written, and whether
259      * a new line should be started and such. All output should happen via
260      * this class.
261      */

262     private class LineRenderer {
263         private Line line = new Line();
264
265         /**
266          * Outputs the given number of newlines.
267          */

268         public void newLines(int count) throws SAXException JavaDoc {
269             try {
270                 if (count == 0)
271                     return;
272
273                 if (line.getLength() > 0)
274                     flushLine(false);
275
276                 if (count == 1)
277                     writer.write('\n');
278                 else
279                     for (int i = 0; i < count; i++)
280                         writer.write('\n');
281             } catch (IOException JavaDoc e) {
282                 throw new SAXException JavaDoc(e);
283             }
284         }
285
286         public void writeText(String JavaDoc text) throws SAXException JavaDoc {
287             try {
288                 if (inPreElement) {
289                     writer.write(text);
290                 } else {
291                     List words = getWords(text);
292
293                     if (startsWithWhitespace(text))
294                         line.addSpace();
295
296                     if (words.size() > 0) {
297
298                         Iterator wordsIt = words.iterator();
299                         boolean firstWord = true;
300
301                         while (wordsIt.hasNext()) {
302                             String JavaDoc word = (String JavaDoc)wordsIt.next();
303                             if (line.getLength() > 0 && line.getLength() + word.length() + 1 > template.maxLineWidth) {
304                                 if (!line.endsOnWordOrSpace())
305                                     writeText(line.emptyIfPossibleBeforeWord(), true);
306                                 else
307                                     writeText(line.empty(), true);
308                             }
309
310                             if (!firstWord)
311                                 line.addSpace();
312                             line.addWord(word);
313
314                             firstWord = false;
315                         }
316
317                         if (endsWithWhitespace(text))
318                             line.addSpace();
319                     }
320                 }
321             } catch (IOException JavaDoc e) {
322                 throw new SAXException JavaDoc(e);
323             }
324         }
325
326         private void writeText(String JavaDoc text, boolean newLine) throws IOException JavaDoc {
327             if (text != null) {
328                 writer.write(text);
329                 if (newLine)
330                     writer.write('\n');
331             }
332         }
333
334         private void flushLine(boolean newLine) throws SAXException JavaDoc {
335             try {
336                 writeText(line.empty(), newLine);
337             } catch (IOException JavaDoc e) {
338                 throw new SAXException JavaDoc(e);
339             }
340         }
341
342         private boolean startsWithWhitespace(String JavaDoc text) {
343             if (text.length() == 0)
344                 return false;
345             return Character.isWhitespace(text.charAt(0));
346         }
347
348         private boolean endsWithWhitespace(String JavaDoc text) {
349             if (text.length() == 0)
350                 return false;
351             return Character.isWhitespace(text.charAt(text.length() - 1));
352         }
353
354         public void writeStartTag(String JavaDoc text, OutputElementDescriptor descriptor) throws SAXException JavaDoc {
355             try {
356                 if (inPreElement) {
357                     writer.write(text);
358                 } else {
359                     // if the line is full
360
if (line.getLength() > 0 && line.getLength() + 1 + text.length() > template.maxLineWidth) {
361                         String JavaDoc toWrite = null;
362                         if (descriptor.isInline())
363                             toWrite = line.emptyIfPossibleBeforeInlineTag();
364                         else
365                             toWrite = line.empty();
366                         writeText(toWrite, true);
367                     }
368
369                     line.addStartTag(text, descriptor);
370                 }
371             } catch (IOException JavaDoc e) {
372                 throw new SAXException JavaDoc(e);
373             }
374         }
375
376         public void writeEndTag(String JavaDoc text, OutputElementDescriptor descriptor) throws SAXException JavaDoc {
377             try {
378                 if (inPreElement) {
379                     writer.write(text);
380                 } else {
381                     // if the line is full
382
if (line.getLength() > 0 && line.getLength() + text.length() > template.maxLineWidth) {
383                         String JavaDoc toWrite = null;
384                         if (descriptor.isInline())
385                             toWrite = line.emptyIfPossibleBeforeInlineTag();
386                         else
387                             toWrite = line.empty();
388                         writeText(toWrite, true);
389                     }
390
391                     line.addEndTag(text, descriptor);
392                 }
393             } catch (IOException JavaDoc e) {
394                 throw new SAXException JavaDoc(e);
395             }
396         }
397
398         private List getWords(String JavaDoc text) {
399             ArrayList words = new ArrayList();
400             int beginWord = -1;
401             for (int i = 0; i < text.length(); i++) {
402                 if (Character.isWhitespace(text.charAt(i))) {
403                     if (beginWord != -1) {
404                         String JavaDoc newWord = text.substring(beginWord, i);
405                         words.add(newWord);
406                         beginWord = -1;
407                     }
408                 } else if (beginWord == -1) {
409                     beginWord = i;
410                 }
411             }
412
413             if (beginWord != -1) {
414                 String JavaDoc newWord = text.substring(beginWord);
415                 words.add(newWord);
416             }
417
418             return words;
419         }
420     }
421
422     /**
423      * The Line class keeps track of which items are on the current line,
424      * and helps finding where the line can be split if it becomes to long.
425      * Breaks are only allowed at whitespace locations or at 'block' tags,
426      * NOT where two inline tags or an inline tag and a word are next to
427      * each other whithout whitespace between them.
428      */

429     private class Line {
430         private List lineItems = new ArrayList();
431         private int length = 0;
432
433         public void addStartTag(String JavaDoc text, OutputElementDescriptor descriptor) {
434             lineItems.add(new StartTag(text, descriptor));
435             length += text.length();
436         }
437
438         public void addEndTag(String JavaDoc text, OutputElementDescriptor descriptor) {
439             if (!descriptor.isInline() && lineItems.size() > 0 && getLastLineItem() instanceof Space) {
440                 lineItems.remove(lineItems.size() - 1);
441                 length--; // one space removed
442
}
443             lineItems.add(new EndTag(text, descriptor));
444             length += text.length();
445         }
446
447         public void addWord(String JavaDoc text) {
448             lineItems.add(new Word(text));
449             length += text.length();
450         }
451
452         public void addSpace() {
453             boolean addSpace = true;
454
455             if (lineItems.size() > 0) {
456                 LineItem lastItem = getLastLineItem();
457                 if (lastItem instanceof Space)
458                     addSpace = false;
459                 else if (lastItem instanceof Tag && !((Tag)lastItem).descriptor.isInline())
460                     addSpace = false;
461             } else if (lineItems.size() == 0) {
462                 addSpace = false;
463             }
464
465             if (addSpace) {
466                 lineItems.add(new Space());
467                 length += 1;
468             }
469         }
470
471         public boolean endsOnWordOrSpace() {
472             if (lineItems.size() > 0) {
473                 LineItem lineItem = getLastLineItem();
474                 return lineItem instanceof Word || lineItem instanceof Space;
475             } else {
476                 return false;
477             }
478         }
479
480         public int getLength() {
481             return length;
482         }
483
484         public String JavaDoc empty() {
485             return empty(lineItems.size() - 1);
486         }
487
488         /**
489          *
490          * @param until index of last item to be included
491          */

492         public String JavaDoc empty(int until) {
493             StringBuffer JavaDoc text = new StringBuffer JavaDoc();
494             int lastPos = until;
495             for (int i = 0; i <= until; i++) {
496                 LineItem lineItem = (LineItem)lineItems.get(i);
497                 if (i == lastPos && lineItem instanceof Space)
498                     continue;
499                 text.append(lineItem.text);
500             }
501             lineItems = new ArrayList(lineItems.subList(until + 1, lineItems.size()));
502             recalcLength();
503             return text.toString();
504         }
505
506         private void recalcLength() {
507             int newLength = 0;
508             for (int i = 0; i < lineItems.size(); i++) {
509                 newLength += ((LineItem)lineItems.get(i)).text.length();
510             }
511             this.length = newLength;
512         }
513
514         /**
515          * If a word it to be added, but it doesn't fit anymore on the line, this method
516          * is used to try to empty the line, either completely or as far as possible, depending
517          * on whether there is an apropriate place to split the line.
518          *
519          * @return the text to be written out, or null if none
520          */

521         public String JavaDoc emptyIfPossibleBeforeWord() {
522             LineItem lineItem = getLastLineItem();
523             if (lineItem instanceof Tag && ((Tag)lineItem).descriptor.isInline()) {
524                 int splitPoint = searchSplitPoint();
525                 if (splitPoint == -1)
526                     return null;
527                 else
528                     return empty(splitPoint);
529             } else {
530                 return empty();
531             }
532         }
533
534         public String JavaDoc emptyIfPossibleBeforeInlineTag() {
535             LineItem lineItem = getLastLineItem();
536             if ((lineItem instanceof Tag && ((Tag)lineItem).descriptor.isInline()) || lineItem instanceof Word) {
537                 int splitPoint = searchSplitPoint();
538                 if (splitPoint == -1)
539                     return null;
540                 else
541                     return empty(splitPoint);
542             } else {
543                 return empty();
544             }
545         }
546
547         /**
548          * This method should only be called if the last item on the line is a Word
549          * or a inline Tag.
550          *
551          * Returns -1 if there's not suitable split point, otherwise returns the index
552          * of the last item before the possible split point.
553          */

554         private int searchSplitPoint() {
555             if (lineItems.size() < 2) {
556                 // there is only one item on the line, and it cannot be disconnected from the next item
557
return -1;
558             }
559
560             LineItem previousLineItem = getLastLineItem();
561             for (int i = lineItems.size() - 2; i >= 0; i--) {
562                 LineItem currentLineItem = (LineItem)lineItems.get(i);
563                 if (currentLineItem instanceof Word && previousLineItem instanceof Word) {
564                     // between two words, we can split
565
return i;
566                 } else if (currentLineItem instanceof Space) {
567                     return i;
568                 } else if (currentLineItem instanceof Tag && !((Tag)currentLineItem).descriptor.isInline()) {
569                     return i;
570                 }
571                 previousLineItem = currentLineItem;
572             }
573             return -1;
574         }
575
576         private LineItem getLastLineItem() {
577             return (LineItem)lineItems.get(lineItems.size() - 1);
578         }
579
580         abstract class LineItem {
581             final String JavaDoc text;
582
583             public LineItem(String JavaDoc text) {
584                 this.text = text;
585             }
586         }
587
588         abstract class Tag extends LineItem {
589             final OutputElementDescriptor descriptor;
590
591             public Tag(String JavaDoc text, OutputElementDescriptor descriptor) {
592                 super(text);
593                 this.descriptor = descriptor;
594             }
595         }
596
597         class StartTag extends Tag {
598             public StartTag(String JavaDoc text, OutputElementDescriptor descriptor) {
599                 super(text, descriptor);
600             }
601         }
602
603         class EndTag extends Tag {
604             public EndTag(String JavaDoc text, OutputElementDescriptor descriptor) {
605                 super(text, descriptor);
606             }
607         }
608
609         class Word extends LineItem {
610
611             public Word(String JavaDoc text) {
612                 super(text);
613             }
614         }
615
616         class Space extends LineItem {
617             public Space() {
618                 super(" ");
619             }
620         }
621     }
622
623     private static class StartElementInfo {
624         private final String JavaDoc localName;
625         private final Attributes JavaDoc attrs;
626
627         public StartElementInfo(String JavaDoc localName, Attributes JavaDoc attrs) {
628             this.localName = localName;
629             this.attrs = attrs;
630         }
631
632         public String JavaDoc getLocalName() {
633             return localName;
634         }
635
636         public Attributes JavaDoc getAttrs() {
637             return attrs;
638         }
639     }
640 }
641
Popular Tags