Serializer


1   /* Copyright 2002-2005 Elliotte Rusty Harold
2      
3      This library is free software; you can redistribute it and/or modify
4      it under the terms of version 2.1 of the GNU Lesser General Public 
5      License as published by the Free Software Foundation.
6      
7      This library is distributed in the hope that it will be useful,
8      but WITHOUT ANY WARRANTY; without even the implied warranty of
9      MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
10     GNU Lesser General Public License for more details.
11     
12     You should have received a copy of the GNU Lesser General Public
13     License along with this library; if not, write to the 
14     Free Software Foundation, Inc., 59 Temple Place, Suite 330, 
15     Boston, MA 02111-1307  USA
16     
17     You can contact Elliotte Rusty Harold by sending e-mail to
18     elharo@metalab.unc.edu. Please include the word "XOM" in the
19     subject line. The XOM home page is located at http://www.xom.nu/
20  */
21  
22  package nu.xom;
23  
24  import java.io.BufferedWriter  ;
25  import java.io.IOException  ;
26  import java.io.OutputStream  ;
27  import java.io.OutputStreamWriter  ;
28  import java.io.UnsupportedEncodingException  ;
29  import java.io.Writer  ;
30  import java.util.Locale  ;
31  
32  /**
33   * <p>
34   *  Outputs a <code>Document</code> object in a specific encoding using
35   *  various options for controlling white space, normalization,
36   *  indenting, line breaking, and base URIs. However, in general these 
37   *  options do affect the document's infoset. In particular, if you set 
38   *  either the maximum line length or the indent size to a positive  
39   *  value, then the serializer will not respect input white space. It 
40   *  may trim leading and trailing space, condense runs of white 
41   *  space to a single space, convert carriage  returns and line 
42   *  feeds to spaces, add extra space where none was present before, 
43   *  and otherwise muck with the document's white space. 
44   *  The defaults, however, preserve all significant white space
45   *  including ignorable white space and boundary white space.
46   * </p>
47   * 
48   * @author Elliotte Rusty Harold
49   * @version 1.0
50   * 
51   */
52  public class Serializer {
53  
54      private TextWriter escaper;
55      private boolean preserveBaseURI = false;
56  
57      
58      /**
59       * <p>
60       * Create a new serializer that uses the UTF-8 encoding.
61       * </p>
62       * 
63       * @param out the output stream to write the document on
64       * 
65       * @throws NullPointerException if <code>out</code> is null
66       */
67      public Serializer(OutputStream   out) {
68          
69          if (out == null) {
70              throw new NullPointerException  ("Null OutputStream");
71          } 
72          try {
73              Writer   writer = new OutputStreamWriter  (out, "UTF8");
74              writer = new BufferedWriter  (writer);
75              escaper = TextWriterFactory.getTextWriter(writer, "UTF-8");
76          }
77          catch (UnsupportedEncodingException   ex) {
78              throw new RuntimeException  (
79                "The VM is broken. It does not understand UTF-8.");
80          }
81          
82      }
83      
84      
85      /**
86       * <p>
87       * Create a new serializer that uses the specified encoding.
88       * The encoding must be recognized by the Java virtual machine. If 
89       * you attempt to use an encoding that the local Java virtual 
90       * machine does not support, the constructor will throw an 
91       * <code>UnsupportedEncodingException</code>.
92       * Currently the following encodings are recognized by XOM:
93       * </p>
94       * 
95       * <ul>
96       *   <li>UTF-8</li>
97       *   <li>UTF-16</li>
98       *   <li>UTF-16BE</li>
99       *   <li>UTF-16LE</li>
100      *   <li>ISO-10646-UCS-2</li>
101      *   <li>ISO-8859-1</li>
102      *   <li>ISO-8859-2</li>
103      *   <li>ISO-8859-3</li>
104      *   <li>ISO-8859-4</li>
105      *   <li>ISO-8859-5</li>
106      *   <li>ISO-8859-6</li>
107      *   <li>ISO-8859-7</li>
108      *   <li>ISO-8859-8</li>
109      *   <li>ISO-8859-9</li>
110      *   <li>ISO-8859-10</li>
111      *   <li>ISO-8859-11 (a.k.a. TIS-620)</li>
112      *   <li>ISO-8859-13</li>
113      *   <li>ISO-8859-14</li>
114      *   <li>ISO-8859-15</li>
115      *   <li>ISO-8859-16</li>
116      *   <li>IBM037 (a.k.a. CP037, EBCDIC-CP-US, EBCDIC-CP-CA, 
117      *         EBCDIC-CP-WA, EBCDIC-CP-NL, and CSIBM037)</li>
118      *   <li>GB18030</li>
119      * </ul>
120      * 
121      * <p>
122      * You can use encodings not in this list if the virtual
123      * machine supports them. However, they may be
124      * significantly slower than the encodings in this list.
125      * </p>
126      * 
127      * <p>
128      * I've noticed Java has significant bugs in its handling of some
129      * of these encodings. In some cases such as 0x80 in Big5, XOM
130      * will escape a character that should not need to be escaped
131      * because Java can't output that character in the specified 
132      * encoding, even though the output character set does contain it.
133      * :-(
134      * </p>
135      * 
136      * @param out the output stream to write the document on
137      * @param encoding the character encoding for the serialization
138 
139      * @throws NullPointerException if <code>out</code> 
140      *     or <code>encoding</code> is null
141      * @throws UnsupportedEncodingException if the VM does not 
142      *     support the requested encoding
143      *  
144      */
145     public Serializer(OutputStream   out, String   encoding)
146       throws UnsupportedEncodingException   {
147         
148         if (out == null) {
149             throw new NullPointerException  ("Null OutputStream");
150         } 
151         if (encoding == null) {
152             throw new NullPointerException  ("Null encoding");
153         } 
154         
155         this.setOutputStream(out, encoding);
156         
157     }
158     
159     
160     /**
161      * <p>
162      * Flushes the previous output stream and 
163      * redirects further output to the new output stream.
164      * </p>
165      * 
166      * 
167      * @param out the output stream to write the document on
168 
169      * @throws NullPointerException if <code>out</code> is null
170      * @throws IOException if the previous output stream 
171      *     encounters an I/O error when flushed
172      *  
173      */
174     public void setOutputStream(OutputStream   out) 
175       throws IOException   {
176         
177         // flush any data onto the old output stream
178         this.flush();
179         int maxLength = getMaxLength();
180         int indent = this.getIndent();
181         String   lineSeparator = getLineSeparator();
182         boolean nfc = getUnicodeNormalizationFormC(); 
183         String   encoding = escaper.getEncoding();
184         setOutputStream(out, encoding);   
185         setIndent(indent);
186         setMaxLength(maxLength);
187         setUnicodeNormalizationFormC(nfc);
188         setLineSeparator(lineSeparator); 
189         
190     }
191 
192     
193     private void setOutputStream(OutputStream   out, String   encoding)
194         throws UnsupportedEncodingException   {
195         
196         Writer   writer;  
197         String   encodingUpperCase = encoding.toUpperCase(Locale.ENGLISH);
198         // Java's Cp037 encoding is broken, so we have to
199         // provide our own.
200         if (encodingUpperCase.equals("IBM037")
201           || encodingUpperCase.equals("CP037")
202           || encodingUpperCase.equals("EBCDIC-CP-US")
203           || encodingUpperCase.equals("EBCDIC-CP-CA")
204           || encodingUpperCase.equals("EBCDIC-CP-WA")
205           || encodingUpperCase.equals("EBCDIC-CP-NL")
206           || encodingUpperCase.equals("CSIBM037")) {
207             writer = new EBCDICWriter(out);
208         }
209         else if (encodingUpperCase.equals("UTF-16") 
210           || encodingUpperCase.equals("ISO-10646-UCS-2")) {
211            // For compatibility with Java 1.2 and earlier
212            writer = new OutputStreamWriter  (out, "UnicodeBig");  
213         }
214         else if (encodingUpperCase.equals("ISO-8859-11") 
215           || encodingUpperCase.equals("TIS-620")) {
216            // Java doesn't recognize the name ISO-8859-11 and 
217            // Java 1.3 and earlier don't recognize TIS-620
218            writer = new OutputStreamWriter  (out, "TIS620");  
219         }
220         else writer = new OutputStreamWriter  (out, encoding);
221         writer = new BufferedWriter  (writer);
222         this.escaper = TextWriterFactory.getTextWriter(writer, encoding);
223         
224     }
225 
226     
227     /**
228      * <p>
229      * Serializes a document onto the output 
230      * stream using the current options.
231      * </p>
232      * 
233      * @param doc the <code>Document</code> to serialize
234      * 
235      * @throws IOException if the underlying output stream
236      *      encounters an I/O error
237      * @throws NullPointerException if <code>doc</code> is null
238      * @throws UnavailableCharacterException if the document contains 
239      *     an unescapable character (e.g. in an element name) that is 
240      *     not available in the current encoding
241      */
242     public void write(Document doc) throws IOException   {
243         
244         escaper.reset();
245         // The OutputStreamWriter automatically inserts
246         // the byte order mark if necessary.
247         writeXMLDeclaration();
248         int childCount = doc.getChildCount();
249         for (int i = 0; i < childCount; i++) {
250             writeChild(doc.getChild(i)); 
251             
252             // Might want to remove this line break in a 
253             // non-XML serializer where it's not guaranteed to be 
254             // OK to add extra line breaks in the prolog
255             escaper.breakLine();
256         }       
257         escaper.flush();
258         
259     }
260 
261 
262     /**
263      * <p>
264      * Writes the XML declaration onto the output stream,
265      * followed by a line break.
266      * </p>
267      * 
268      * @throws IOException if the underlying output stream
269      *      encounters an I/O error
270      */
271     protected void writeXMLDeclaration() throws IOException   {
272         
273         escaper.writeMarkup("<?xml version=\"1.0\" encoding=\"");
274         escaper.writeMarkup(escaper.getEncoding());
275         escaper.writeMarkup("\"?>");
276         escaper.breakLine();
277         
278     }
279     
280     
281     /**
282      * <p>
283      * Serializes an element onto the output stream using the current
284      * options. The result is guaranteed to be well-formed. If 
285      * <code>element</code> does not have a parent element, the output  
286      * will also be namespace well-formed.
287      * </p>
288      * 
289      * <p>
290      *   If the element is empty, this method invokes 
291      *   <code>writeEmptyElementTag</code>. If the element is not 
292      *   empty, then: 
293      * </p>
294      * 
295      * <ol>
296      *   <li>It calls <code>writeStartTag</code></li>
297      *   <li>It passes each of the element's children to 
298      *       <code>writeChild</code> in order.</li>
299      *   <li>It calls <code>writeEndTag</code></li>
300      * </ol>
301      * 
302      * <p>
303      *   It may break lines or add white space if the serializer has
304      *   been configured to indent or use a maximum line length.
305      * </p>
306      * 
307      * @param element the <code>Element</code> to serialize
308      * 
309      * @throws IOException if the underlying output stream
310      *     encounters an I/O error
311      * @throws UnavailableCharacterException if the element name   
312      *     contains a character that is not available in the 
313      *     current encoding
314      */
315     protected void write(Element element) throws IOException   {
316 
317         if (escaper.isIndenting() 
318           && !escaper.isPreserveSpace() 
319           && !escaper.justBroke()) {
320             escaper.breakLine();
321         }
322         
323         // workaround for case where only children are empty text nodes
324         boolean hasRealChildren = false;
325         int childCount = element.getChildCount();
326         for (int i = 0; i < childCount; i++) {
327             Node child = element.getChild(i);
328             if (child.isText()) {
329                 Text t = (Text) child;
330                 if (t.isEmpty()) continue;
331             }
332             hasRealChildren = true;
333             break;
334         }
335         
336         if (hasRealChildren) {
337             writeStartTag(element);
338             // adjust for xml:space
339             boolean wasPreservingWhiteSpace = escaper.isPreserveSpace();
340             String   newXMLSpaceValue = element.getAttributeValue(
341               "space", "http://www.w3.org/XML/1998/namespace");
342             if  (newXMLSpaceValue != null) {
343                 if ("preserve".equals(newXMLSpaceValue)){
344                     escaper.setPreserveSpace(true);
345                 }
346                 else if ("default".equals(newXMLSpaceValue)){
347                     escaper.setPreserveSpace(false);
348                 }
349             }
350             
351             escaper.incrementIndent();
352             // children
353             for (int i = 0; i < childCount; i++) {
354                 writeChild(element.getChild(i)); 
355             }
356             escaper.decrementIndent();
357             if (escaper.getIndent() > 0 && !escaper.isPreserveSpace()) {
358                 if (hasNonTextChildren(element)) {
359                     escaper.breakLine();
360                 }
361             }
362             writeEndTag(element);
363             
364             // restore parent value
365             if  (newXMLSpaceValue != null) {
366                 escaper.setPreserveSpace(wasPreservingWhiteSpace);
367             }
368                         
369         }
370         else {
371             writeEmptyElementTag(element);   
372         }
373         escaper.flush();
374         
375     }
376 
377     
378     private boolean hasNonTextChildren(Element element) {
379         
380         int childCount = element.getChildCount();
381         for (int i = 0; i < childCount; i++) {
382             if (! element.getChild(i).isText()) return true;  
383         }
384         return false;
385         
386     }
387 
388 
389     // writeEndTag should not normally throw UnavailableCharacterException 
390     // because that would already have been thrown for the
391     // corresponding start-tag.
392     /**
393      * <p>
394      *   Writes the end-tag for an element in the form
395      *   <code>&lt;/<i>name</i>&gt;</code>.
396      * </p>
397      * 
398      * @param element the element whose end-tag is written
399      * 
400      * @throws IOException if the underlying output stream
401      *     encounters an I/O error
402      */
403     protected void writeEndTag(Element element) throws IOException   {
404         escaper.writeMarkup("</");
405         escaper.writeMarkup(element.getQualifiedName());
406         escaper.writeMarkup(">");
407     }
408 
409     
410     /**
411      * 
412      * <p>
413      *  Writes the start-tag for the element including
414      *  all its namespace declarations and attributes.
415      * </p>
416      * 
417      * <p>
418      *   The <code>writeAttributes</code> method is called to write
419      *   all the non-namespace-declaration attributes. 
420      *   The <code>writeNamespaceDeclarations</code> method
421      *   is called to write all the namespace declaration attributes.
422      * </p>
423      * 
424      * @param element the element whose start-tag is written
425      * 
426      * @throws IOException if the underlying output stream
427      *     encounters an I/O error
428      * @throws UnavailableCharacterException if the name of the element
429      *     or the name of any of its attributes contains a character  
430      *     that is not available in the current encoding
431      */
432     protected void writeStartTag(Element element) throws IOException   {
433         writeTagBeginning(element);
434         escaper.writeMarkup('>');
435     }
436 
437     
438     /**
439      * 
440      * <p>
441      *  Writes an empty-element tag for the element 
442      *  including all its namespace declarations and attributes.
443      * </p>
444      * 
445      * <p>
446      *   The <code>writeAttributes</code> method is called to write
447      *   all the non-namespace-declaration attributes. 
448      *   The <code>writeNamespaceDeclarations</code> method
449      *   is called to write all the namespace declaration attributes.
450      * </p>
451      * 
452      * <p>
453      *   If subclasses don't wish empty-element tags to be used,
454      *   they can override this method to simply invoke 
455      *   <code>writeStartTag</code> followed by 
456      *   <code>writeEndTag</code>.
457      * </p>
458      * 
459      * @param element the element whose empty-element tag is written
460      * 
461      * @throws IOException if the underlying output stream
462      *     encounters an I/O error
463      * @throws UnavailableCharacterException if the name of the element or the name of
464      *     any of its attributes contains a character that is not 
465      *     available in the current encoding
466      */
467     protected void writeEmptyElementTag(Element element) 
468       throws IOException   {
469         writeTagBeginning(element);
470         escaper.writeMarkup("/>");
471     }
472 
473     
474     // This just extracts the commonality between writeStartTag  
475     // and writeEmptyElementTag
476     private void writeTagBeginning(Element element) 
477       throws IOException   {
478         escaper.writeMarkup('<');
479         escaper.writeMarkup(element.getQualifiedName());
480         writeAttributes(element);           
481         writeNamespaceDeclarations(element);
482     }
483 
484 
485     /**
486      * <p>
487      *   Writes all the attributes of the specified
488      *   element onto the output stream, one at a time, separated
489      *   by white space. If preserveBaseURI is true, and it is
490      *   necessary to add an <code>xml:base</code> attribute
491      *   to the element in order to preserve the base URI, then 
492      *   that attribute is also written here.
493      *   Each individual attribute is written by invoking
494      *   <code>write(Attribute)</code>.
495      * </p>
496      * 
497      * @param element the <code>Element</code> whose attributes are 
498      *     written
499      * @throws IOException if the underlying output stream
500      *     encounters an I/O error
501      * @throws UnavailableCharacterException if the name of any of
502      *     the element's attributes contains a character that is not 
503      *     available in the current encoding
504      */
505     protected void writeAttributes(Element element)
506       throws IOException   {
507           
508         // check to see if we need an xml:base attribute
509         if (preserveBaseURI) {
510             ParentNode parent = element.getParent();
511             if (element.getAttribute("base", 
512               "http://www.w3.org/XML/1998/namespace") == null) {
513                 String   baseValue = element.getBaseURI();
514                 if (parent == null 
515                   || parent.isDocument()
516                   || !element.getBaseURI()
517                        .equals(parent.getBaseURI())) {
518                        
519                     escaper.writeMarkup(' ');
520                     Attribute baseAttribute = new Attribute(
521                       "xml:base", 
522                       "http://www.w3.org/XML/1998/namespace", 
523                       baseValue);
524                     write(baseAttribute);
525                 }
526             }
527         }
528         
529         int attributeCount = element.getAttributeCount();
530         for (int i = 0; i < attributeCount; i++) {
531             Attribute attribute = element.getAttribute(i);
532             escaper.writeMarkup(' ');
533             write(attribute);
534         }  
535     }
536 
537     
538     /**
539      * <p>
540      *   Writes all the namespace declaration
541      *   attributes of the specified element onto the output stream,
542      *   one at a time, separated by white space. Each individual 
543      *   declaration is written by invoking 
544      *   <code>writeNamespaceDeclaration</code>.
545      * </p>
546      * 
547      * @param element the <code>Element</code> whose attributes are 
548      *     written
549      * @throws IOException if the underlying output stream
550      *     encounters an I/O error
551      * @throws UnavailableCharacterException if any of the element's namespace prefixes
552      *     contains a character that is not available in the current 
553      *     encoding
554      */
555     protected void writeNamespaceDeclarations(Element element)
556       throws IOException   {
557         
558         ParentNode parent = element.getParent();
559         int count = element.getNamespaceDeclarationCount();
560         for (int i = 0; i < count; i++) {
561             String   additionalPrefix = element.getNamespacePrefix(i);
562             String   uri = element.getNamespaceURI(additionalPrefix);
563             if (parent.isElement()) {
564                Element parentElement = (Element) parent;   
565                if (uri.equals(
566                  parentElement.getNamespaceURI(additionalPrefix))) {
567                    continue;
568                } 
569             }
570             else if (uri.equals("")) {
571                 continue; // no need to say xmlns=""   
572             }
573             
574             escaper.writeMarkup(' ');
575             writeNamespaceDeclaration(additionalPrefix, uri);
576         } 
577     }
578 
579 
580     /**
581      * <p>
582      *   Writes a namespace declaration in the form
583      *   <code>xmlns:<i>prefix</i>="<i>uri</i>"</code> or 
584      *   <code>xmlns="<i>uri</i>"</code>. It does not write
585      *   the spaces on either side of the namespace declaration.
586      *   These are written by <code>writeNamespaceDeclarations</code>.
587      * </p>
588      * 
589      * @param prefix the namespace prefix; the empty string for the
590      *     default namespace
591      * @param uri the namespace URI
592      * 
593      * @throws IOException if the underlying output stream
594      *     encounters an I/O error
595      * @throws UnavailableCharacterException if the namespace prefix contains a 
596      *     character that is not available in the current encoding
597      */
598     protected void writeNamespaceDeclaration(String   prefix, String   uri)
599       throws IOException   {
600         
601         if ("".equals(prefix)) {
602             escaper.writeMarkup("xmlns"); 
603         }
604         else {
605             escaper.writeMarkup("xmlns:"); 
606             escaper.writeMarkup(prefix); 
607         } 
608         escaper.writeMarkup("=\""); 
609         escaper.writePCDATA(uri);   
610         escaper.writeMarkup('\"');
611         
612     }
613 
614     
615     /**
616      * <p>
617      *   Writes an attribute in the form 
618      *   <code><i>name</i>="<i>value</i>"</code>.
619      *   Characters in the attribute value are escaped as necessary.
620      * </p>
621      * 
622      * @param attribute the <code>Attribute</code> to write
623      * 
624      * @throws IOException if the underlying output stream
625      *     encounters an I/O error
626      * @throws UnavailableCharacterException if the attribute name contains a character 
627      *     that is not available in the current encoding
628      * 
629      */
630     protected void write(Attribute attribute) throws IOException   {
631         escaper.writeMarkup(attribute.getQualifiedName());
632         escaper.writeMarkup("=\"");
633         escaper.writeAttributeValue(attribute.getValue());
634         escaper.writeMarkup('\"');  
635     }
636     
637     
638     /**
639      * <p>
640      * Writes a comment onto the output stream using the current 
641      * options. Since character and entity references are not resolved
642      * in comments, comments can only be serialized when all
643      * characters they contain are available in the current 
644      * encoding.
645      * </p>
646      * 
647      * @param comment the <code>Comment</code> to serialize
648      * 
649      * @throws IOException if the underlying output stream 
650      *     encounters an I/O error
651      * @throws UnavailableCharacterException if the comment contains a 
652      *     character that is not available in the current encoding
653      */
654     protected void write(Comment comment) throws IOException   {
655         if (escaper.isIndenting()) escaper.breakLine();
656         escaper.writeMarkup("<!--");
657         escaper.writeMarkup(comment.getValue());
658         escaper.writeMarkup("-->");
659     }
660     
661     
662     /**
663      * <p>
664      * Writes a processing instruction
665      * onto the output stream using the current options.
666      * Since character and entity references are not resolved
667      * in processing instructions, processing instructions
668      * can only be serialized when all
669      * characters they contain are available in the current 
670      * encoding.
671      * </p>
672      * 
673      * @param instruction the <code>ProcessingInstruction</code> 
674      *     to serialize
675      * 
676      * @throws IOException if the underlying output stream
677      *     encounters an I/O error
678      * @throws UnavailableCharacterException if the comment contains a 
679      *     character that is not available in the current encoding
680      */
681     protected void write(ProcessingInstruction instruction) 
682       throws IOException   {
683         
684         if (escaper.isIndenting()) escaper.breakLine();
685         escaper.writeMarkup("<?");
686         escaper.writeMarkup(instruction.getTarget());
687         String   value = instruction.getValue();
688         // for canonical XML, only output a space after the target
689         // if there is a value
690         if (!"".equals(value)) {
691             escaper.writeMarkup(' ');
692             escaper.writeMarkup(value);
693         }
694         escaper.writeMarkup("?>"); 
695         
696     }
697     
698     /**
699      * <p>
700      * Writes a <code>Text</code> object
701      * onto the output stream using the current options.
702      * Reserved characters such as &lt;, &gt; and "
703      * are escaped using the standard entity references 
704      * such as <code>&amp;lt;</code>, <code>&amp;gt;</code>, 
705      * and <code>&amp;quot;</code>.
706      * </p>
707      * 
708      * <p>
709      * Characters which cannot be encoded in the current character set
710      * (for example, &Omega; in ISO-8859-1) are encoded using 
711      * character references. 
712      * </p> 
713      * 
714      * @param text the <code>Text</code> to serialize
715      * 
716      * @throws IOException if the underlying output stream
717      *     encounters an I/O error
718      */
719     protected void write(Text text) throws IOException   {
720         
721         // XXX Is there a shortcut that takes advantage of the
722         // data being stored in UTF-8 here? perhaps even if only
723         // when serializing to UTF-8?
724         String   value = text.getValue();
725         if (text.isCDATASection() 
726           && value.indexOf("]]>") == -1) {
727             if (!(escaper instanceof UnicodeWriter)) {
728                 int length = value.length();
729                 for (int i = 0; i < length; i++) {
730                    if (escaper.needsEscaping(value.charAt(i))) {
731                         // can't use CDATA section
732                         escaper.writePCDATA(value);
733                         return;   
734                    }   
735                 }
736             }
737             escaper.writeMarkup("<![CDATA[");
738             escaper.writeMarkup(value);
739             escaper.writeMarkup("]]>");
740         }
741         // is this boundary whitespace we can ignore?
742         else if (isBoundaryWhitespace(text)) {
743             return; // without writing node
744         }
745         else {
746             escaper.writePCDATA(value);
747         }
748         
749     }  
750     
751     
752     private boolean isBoundaryWhitespace(Text text) {
753         
754         if (getIndent() <= 0) return false;
755         
756         // XXX check this without getValue
757         if (!"".equals(text.getValue().trim())) return false;
758         ParentNode parent = text.getParent();
759         
760         int position = parent.indexOf(text);
761         
762         if (position == 0 && parent.getChildCount() == 1) return false;
763         Node previous = null;
764         Node next = null;
765         if (position != 0) previous = parent.getChild(position-1);
766         if (position != parent.getChildCount()-1) {
767             next = parent.getChild(position+1);
768         }
769         if (previous == null || !previous.isText()) {
770             if (next == null || !next.isText()) {
771                 return true;
772             }
773         }
774         
775         return false;
776         
777     }
778 
779     
780     /**
781      * <p>
782      * Writes a <code>DocType</code> object
783      * onto the output stream using the current options.
784      * </p>
785      * 
786      * @param doctype the document type declaration to serialize
787      * 
788      * @throws IOException if the underlying 
789      *     output stream encounters an I/O error
790      * @throws UnavailableCharacterException if the document type   
791      *     declaration contains a character that is not available 
792      *     in the current encoding
793      */
794     protected void write(DocType doctype) throws IOException   {
795         
796         escaper.writeMarkup("<!DOCTYPE ");
797         escaper.writeMarkup(doctype.getRootElementName());
798         if (doctype.getPublicID() != null) {
799           escaper.writeMarkup(" PUBLIC \"" + doctype.getPublicID() 
800            + "\" \"" + doctype.getSystemID() + "\"");
801         } 
802         else if (doctype.getSystemID() != null) {
803           escaper.writeMarkup(
804             " SYSTEM \"" + doctype.getSystemID() + "\"");
805         } 
806         
807         String   internalDTDSubset = doctype.getInternalDTDSubset();
808         if (!internalDTDSubset.equals("")) {
809             escaper.writeMarkup(" [");    
810             escaper.breakLine();
811             escaper.setInDocType(true);
812             escaper.writeMarkup(internalDTDSubset); 
813             escaper.setInDocType(false);
814             escaper.writeMarkup("]"); 
815         }
816 
817         escaper.writeMarkup(">");
818         
819     }   
820 
821     
822     /**
823      * <p>
824      * Writes a child node onto the output stream using the  
825      * current options. It is invoked when walking the tree to
826      * serialize the entire document. It is not called, and indeed
827      * should not be called, for either the <code>Document</code> 
828      * node or for attributes. 
829      * </p>
830      * 
831      * @param node the <code>Node</code> to serialize
832      * 
833      * @throws IOException if the underlying output stream
834      *     encounters an I/O error
835      * @throws XMLException if an <code>Attribute</code> or a 
836      *     <code>Document</code> is passed to this method
837      */
838     protected void writeChild(Node node) throws IOException   {
839         
840         if (node.isElement()) {
841             write((Element) node);
842         }
843         else if (node.isText()) {
844             write((Text) node);
845         }
846         else if (node.isComment()) {
847             write((Comment) node);
848         }
849         else if (node.isProcessingInstruction()) {
850             write((ProcessingInstruction) node);
851         }
852         else if (node.isDocType()) {
853             write((DocType) node);
854         }
855         else {
856             throw new XMLException("Cannot write a " + 
857               node.getClass().getName() + 
858               " from the writeChildNode() method");
859         }
860         
861     }
862  
863     
864     /** <p>
865      * Writes a string onto the underlying output stream.
866      * Non-ASCII characters that are not available in the
867      * current character set are encoded with numeric character
868      * references. The three reserved characters &lt;, &gt;, and &amp; 
869      * are escaped using the standard entity references 
870      * <code>&amp;lt;</code>, <code>&amp;gt;</code>, 
871      * and <code>&amp;amp;</code>.
872      * Double and single quotes are not escaped.
873      * </p> 
874      * 
875      * @param text the parsed character data to serialize
876      * 
877      * @throws IOException if the underlying output stream 
878      *     encounters an I/O error
879      */
880     protected final void writeEscaped(String   text) throws IOException   {
881         escaper.writePCDATA(text);
882     }   
883  
884     /** <p>
885      *   Writes a string onto the underlying output stream.
886      *   Non-ASCII characters that are not available in the
887      *   current character set are escaped using hexadecimal numeric
888      *   character references. Carriage returns, line feeds, and tabs
889      *   are also escaped using hexadecimal numeric character 
890      *   references in order to ensure their preservation on a round
891      *   trip. The four reserved characters &lt;, &gt;, &amp;,  
892      *   and &quot; are escaped using the standard entity references 
893      *   <code>&amp;lt;</code>, <code>&amp;gt;</code>, 
894      *   <code>&amp;amp;</code>, and <code>&amp;quot;</code>. 
895      *   The single quote is not escaped. 
896      * </p> 
897      * 
898      * @param value the attribute value to serialize
899      * 
900      * @throws IOException if the underlying output stream 
901      *     encounters an I/O error
902      */
903     protected final void writeAttributeValue(String   value)
904       throws IOException   {
905         escaper.writeAttributeValue(value);
906     }   
907  
908     
909     /** <p>
910      *   Writes a string onto the underlying output stream.
911      *   without escaping any characters.
912      *   Non-ASCII characters that are not available in the
913      *   current character set cause an <code>IOException</code>.
914      * </p> 
915      * 
916      * @param text the <code>String</code> to serialize
917      * 
918      * @throws IOException if the underlying output stream
919      *     encounters an I/O error or <code>text</code> contains 
920      *     characters not available in the current character set
921      */
922     protected final void writeRaw(String   text) throws IOException   {
923         escaper.writeMarkup(text);
924     }   
925  
926     
927     /** <p>
928      *   Writes the current line break string
929      *   onto the underlying output stream and indents
930      *   as specified by the current level and the indent property.
931      * </p> 
932      * 
933      * @throws IOException if the underlying output stream 
934      *     encounters an I/O error
935      */
936     protected final void breakLine() throws IOException   {
937         escaper.breakLine();
938     }   
939     
940     
941     /**
942      * <p>
943      * Flushes the data onto the output stream.
944      * It is not enough to flush the output stream.
945      * You must flush the serializer object itself because it
946      * uses some internal buffering.
947      * The serializer will flush the underlying output stream.
948      * </p>
949      * 
950      * @throws IOException  if the underlying  
951      *     output stream encounters an I/O error
952      */
953     public void flush() throws IOException   {
954         escaper.flush();    
955     }
956 
957     
958     /**
959      * <p>
960      * Returns the number of spaces this serializer indents.
961      * </p>
962      * 
963      * @return the number of spaces this serializer indents
964      *     each successive level beyond the previous one
965      */
966     public int getIndent() {
967         return escaper.getIndent();
968     }
969 
970 
971     /**
972      * <p>
973      * Sets the number of additional spaces to add to each successive
974      * level in the hierarchy. Use 0 for no extra indenting. The 
975      * maximum indentation is in limited to approximately half the
976      * maximum line length. The serializer will not indent further 
977      * than that no matter how many levels deep the hierarchy is.
978      * </p>
979      * 
980      * <p>
981      *   When this variable is set to a value greater than 0,
982      *   the serializer does not preserve white space. Spaces,
983      *   tabs, carriage returns, and line feeds can all be 
984      *   interchanged at the serializer's discretion, and additional
985      *   white space may be added before and after tags.
986      *   Carriage returns, line feeds, and tabs will not be 
987      *   escaped with numeric character references.
988      * </p>
989      * 
990      * <p>
991      *   Inside elements with an <code>xml:space="preserve"</code> 
992      *   attribute, white space is preserved and no indenting 
993      *   takes place, regardless of the setting of the indent
994      *   property, unless, of course, an 
995      *   <code>xml:space="default"</code> attribute overrides the
996      *   <code>xml:space="preserve"</code> attribute.
997      * </p>
998      * 
999      * <p>
1000     *   The default value for indent is 0; that is, the default is
1001     *   not to add or subtract any white space from the source
1002     *   document.  
1003     * </p>
1004     * 
1005     * @param indent the number of spaces to indent 
1006     *      each successive level of the hierarchy
1007     * 
1008     * @throws IllegalArgumentException if indent is less than zero
1009     * 
1010     */
1011    public void setIndent(int indent) {
1012        if (indent < 0) {
1013            throw new IllegalArgumentException  (
1014              "Indent cannot be negative"
1015            );
1016        }
1017        escaper.setIndent(indent);
1018    }
1019
1020    
1021    /**
1022     * <p>
1023     * Returns the string used as a line separator.
1024     * This is always <code>"\n"</code>, <code>"\r"</code>, 
1025     * or <code>"\r\n"</code>.
1026     * </p>
1027     * 
1028     * @return the line separator
1029     */
1030    public String   getLineSeparator() {
1031        return escaper.getLineSeparator();
1032    }
1033
1034    
1035    /**
1036     * <p>
1037     * Sets the line separator. This can only be one of the 
1038     * three strings <code>"\n"</code>, <code>"\r"</code>, 
1039     * or <code>"\r\n"</code>. All other values are forbidden.
1040     * If this method is invoked, then 
1041     * line separators in the character data will be changed to this
1042     * string. Line separators in attribute values will be changed
1043     * to the hexadecimal numeric character references corresponding
1044     * to this string.
1045     * </p>
1046     * 
1047     * <p>
1048     *  The default line separator is <code>"\r\n"</code>. However, 
1049     *  line separators in character data and attribute values are not 
1050     *  changed to this string, unless this method is called first.
1051     * </p>
1052     * 
1053     * @param lineSeparator the line separator to set
1054     * 
1055     * @throws IllegalArgumentException if you attempt to use any line
1056     *    separator other than <code>"\n"</code>, <code>"\r"</code>, 
1057     *    or <code>"\r\n"</code>.
1058     * 
1059     */
1060    public void setLineSeparator(String   lineSeparator) {
1061        escaper.setLineSeparator(lineSeparator);  
1062    }
1063
1064    
1065    /**
1066     * <p>
1067     * Returns the preferred maximum line length.
1068     * </p>
1069     * 
1070     * @return the preferred maximum line length.
1071     */
1072    public int getMaxLength() {
1073        return escaper.getMaxLength();
1074    }
1075
1076    
1077    /**
1078     * <p>
1079     * Sets the suggested maximum line length for this serializer.
1080     * Setting this to 0 indicates that no automatic wrapping is to be
1081     * performed. When a line approaches this length, the serializer 
1082     * begins looking for opportunities to break the line. Generally 
1083     * it will break on any ASCII white space character (tab, carriage 
1084     * return, linefeed, and space). In some circumstances the 
1085     * serializer may not be able to break the line before the maximum
1086     * length is reached. For instance, if an element name is longer 
1087     * than the maximum line length the only way to correctly 
1088     * serialize it is to exceed the maximum line length. In this case,
1089     *  the serializer will exceed the maximum line length.
1090     * </p>
1091     * 
1092     * <p>
1093     * The default value for maximum line length is 0, which is  
1094     * interpreted as no maximum line length. 
1095     * Setting this to a negative value just sets it to 0. 
1096     * </p>
1097     * 
1098     * <p>
1099     *   When this variable is set to a value greater than 0,
1100     *   the serializer does not preserve white space. Spaces,
1101     *   tabs, carriage returns, and line feeds can all be 
1102     *   interchanged at the serializer's discretion.
1103     *   Carriage returns, line feeds, and tabs will not be 
1104     *   escaped with numeric character references.
1105     * </p>
1106     * 
1107     * <p>
1108     *   Inside elements with an <code>xml:space="preserve"</code> 
1109     *   attribute, the maximum line length is not enforced, 
1110     *   regardless of the setting of the this property, unless,  
1111     *   of course, an <code>xml:space="default"</code> attribute 
1112     *   overrides the <code>xml:space="preserve"</code> attribute.
1113     * </p>
1114     * 
1115     * @param maxLength the preferred maximum line length
1116     */
1117    public void setMaxLength(int maxLength) {
1118        escaper.setMaxLength(maxLength);
1119    }
1120
1121    
1122    /**
1123     * <p>
1124     * Returns true if this serializer preserves the original
1125     * base URIs by inserting extra <code>xml:base</code> attributes.
1126     * </p>
1127     * 
1128     * @return true if this <code>Serializer</code> inserts
1129     *    extra <code>xml:base</code> attributes to attempt to 
1130     *    preserve base URI information from the document.
1131     */
1132    public boolean getPreserveBaseURI() {
1133        return preserveBaseURI;
1134    }
1135
1136    
1137    /**
1138     * <p>
1139     * Determines whether this serializer inserts
1140     * extra <code>xml:base</code> attributes to attempt to 
1141     * preserve base URI information from the document.
1142     * The default is false, do not preserve base URI information.
1143     * <code>xml:base</code> attributes that have been explicitly
1144     * added to an element are always output. This property only  
1145     * determines whether or not extra <code>xml:base</code> 
1146     * attributes are added.
1147     * </p>
1148     * 
1149     * @param preserve true if <code>xml:base</code> 
1150     *     attributes should be added as necessary
1151     *     to preserve base URI information 
1152     */
1153    public void setPreserveBaseURI(boolean preserve) {
1154        this.preserveBaseURI = preserve;
1155    }
1156    
1157    
1158    /**
1159     * <p>
1160     *   Returns the name of the character encoding used by 
1161     *   this serializer.
1162     * </p>
1163     * 
1164     * @return the encoding used for the output document
1165     */
1166    public String   getEncoding() {
1167        return escaper.getEncoding();   
1168    }
1169    
1170    /**
1171     * <p>
1172     *   If true, this property indicates serialization will
1173     *   perform Unicode normalization on all data using normalization
1174     *   form C (NFC). Performing Unicode normalization may change the
1175     *   document's infoset. The default is false; do not normalize.
1176     * </p>
1177     * 
1178     * <p>
1179     *   The implementation used is IBM's <a target="_top" HREF=
1180     *   "http://oss.software.ibm.com/icu4j/index.html">International
1181     *   Components for Unicode <i>for Java</i> (ICU4J) 2.6</a>. 
1182     *   This version is based on Unicode 4.0. 
1183     * </p>
1184     * 
1185     * <p>
1186     *   This feature has not yet been benchmarked or optimized.
1187     *   It may result in substantially slower code. 
1188     * </p>
1189     * 
1190     * <p>
1191     *   If all your data is in the first 256 code points of Unicode
1192     *   (i.e. the ISO-8859-1, Latin-1 character set), then it's 
1193     *   already in normalization form C and normalizing won't change
1194     *   anything.
1195     * </p>
1196     * 
1197     * @param normalize true if normalization is performed; 
1198     *     false if it isn't
1199     */
1200    public void setUnicodeNormalizationFormC(boolean normalize) {
1201        escaper.setNFC(normalize);   
1202    }
1203
1204    
1205    /**
1206     * <p>
1207     *   Indicates whether serialization will
1208     *   perform Unicode normalization on all data using normalization
1209     *   form C (NFC). The default is false; do not normalize.
1210     * </p>
1211     * 
1212     * @return true if this serializer performs Unicode 
1213     *     normalization; false if it doesn't
1214     */
1215    public boolean getUnicodeNormalizationFormC() {
1216        return escaper.getNFC();   
1217    }
1218    
1219    
1220    /**
1221     * <p>
1222     *   Returns the current column number of the output stream. This 
1223     *   method useful for subclasses that implement their own pretty
1224     *   printing strategies by inserting white space and line breaks 
1225     *   at appropriate points.
1226     * </p>
1227     * 
1228     * <p>
1229     *   Columns are counted based on Unicode characters, not Java
1230     *   chars. A surrogate pair counts as one character in this 
1231     *   context, not two. However, a character followed by a 
1232     *   combining character (e.g. e followed by combining accent
1233     *   acute) counts as two characters. This latter choice
1234     *   (treating combining characters like regular characters)
1235     *   is under review, and may change in the future if it's not
1236     *   too big a performance hit.
1237     * </p>
1238     * 
1239     * @return the current column number
1240     */
1241    protected final int getColumnNumber() {
1242        return escaper.getColumnNumber();
1243    }
1244    
1245}
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags