Canonicalizer


1   /* Copyright 2002-2005 Elliotte Rusty Harold
2      
3      This library is free software; you can redistribute it and/or modify
4      it under the terms of version 2.1 of the GNU Lesser General Public 
5      License as published by the Free Software Foundation.
6      
7      This library is distributed in the hope that it will be useful,
8      but WITHOUT ANY WARRANTY; without even the implied warranty of
9      MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
10     GNU Lesser General Public License for more details.
11     
12     You should have received a copy of the GNU Lesser General Public
13     License along with this library; if not, write to the 
14     Free Software Foundation, Inc., 59 Temple Place, Suite 330, 
15     Boston, MA 02111-1307  USA
16     
17     You can contact Elliotte Rusty Harold by sending e-mail to
18     elharo@metalab.unc.edu. Please include the word "XOM" in the
19     subject line. The XOM home page is located at http://www.xom.nu/
20  */
21  
22  package nu.xom.canonical;
23  
24  import java.io.IOException  ;
25  import java.io.OutputStream  ;
26  import java.util.Arrays  ;
27  import java.util.Comparator  ;
28  
29  import nu.xom.Attribute;
30  import nu.xom.Comment;
31  import nu.xom.DocType;
32  import nu.xom.Document;
33  import nu.xom.Element;
34  import nu.xom.Node;
35  import nu.xom.ParentNode;
36  import nu.xom.ProcessingInstruction;
37  import nu.xom.Serializer;
38  import nu.xom.Text;
39  
40  /**
41   * <p>
42   *   Writes XML in the format specified by <a target="_top"
43   *   HREF="http://www.w3.org/TR/2001/REC-xml-c14n-20010315">Canonical
44   *   XML Version 1.0</a>.
45   * </p>
46   *
47   * <p>
48   *   Only complete documents can be canonicalized.
49   *   Document subset canonicalization is not yet supported.
50   * </p>
51   * 
52   * @author Elliotte Rusty Harold
53   * @version 1.0
54   *
55   */
56  public class Canonicalizer {
57  
58      private boolean withComments;
59      private Serializer serializer;
60      
61      private static Comparator   comparator = new AttributeComparator();
62      
63      private static class AttributeComparator implements Comparator   {
64          
65          public int compare(Object   o1, Object   o2) {
66              Attribute a1 = (Attribute) o1;   
67              Attribute a2 = (Attribute) o2;   
68              
69              String   namespace1 = a1.getNamespaceURI();
70              String   namespace2 = a2.getNamespaceURI();
71              if (namespace1.equals(namespace2)) { 
72                  return a1.getLocalName().compareTo(a2.getLocalName());             
73              }
74              else if (namespace1.equals("")) {
75                   return -1;   
76              }
77              else if (namespace2.equals("")) {
78                   return 1;   
79              }
80              else { // compare namespace URIs
81                  return namespace1.compareTo(namespace2);               
82              }
83              
84          }
85  
86      }
87      
88      
/**
 * <p>
 *   Creates a <code>Canonicalizer</code> whose output includes
 *   comments. This is the same as calling the two-argument
 *   constructor with <code>withComments</code> set to true.
 * </p>
 *
 * @param out the output stream the canonical document
 *     is written onto
 */
public Canonicalizer(OutputStream out) {
    this(out, true);
}
101 
102     
/**
 * <p>
 *   Creates a <code>Canonicalizer</code> that writes canonical XML
 *   onto <code>out</code>, either keeping or dropping comments.
 *   The underlying serializer is set to use a line feed as its
 *   line separator.
 * </p>
 *
 * @param out the output stream the canonical document
 *     is written onto
 * @param withComments true if comments should appear in the
 *     output, false if they should be left out
 */
public Canonicalizer(OutputStream out, boolean withComments) {
    Serializer canonical = new CanonicalXMLSerializer(out);
    canonical.setLineSeparator("\n");
    this.serializer = canonical;
    this.withComments = withComments;
}
120 
121 
122     private class CanonicalXMLSerializer extends Serializer {
123 
/**
 * <p>
 *   Creates a <code>Serializer</code> that writes onto
 *   <code>out</code> and uses a line feed as its line separator.
 * </p>
 *
 * <p>
 *   This constructor takes no comment flag. Whether comments are
 *   written is read from the enclosing <code>Canonicalizer</code>'s
 *   <code>withComments</code> field at the time a comment is
 *   serialized.
 * </p>
 *
 * @param out the <code>OutputStream</code> the document
 *     is written onto
 */
CanonicalXMLSerializer(OutputStream out) {
    super(out);
    setLineSeparator("\n");
}
139 
140         
141         /**
142          * <p>
143          * Serializes a document onto the output 
144          * stream using the canonical XML algorithm.
145          * </p>
146          * 
147          * @param doc the <code>Document</code> to serialize
148          * 
149          * @throws IOException if the underlying <code>OutputStream</code>
150          *      encounters an I/O error
151          */
152          public final void write(Document doc) throws IOException   {
153             
154             int position = 0;        
155             while (true) {
156                 Node child = doc.getChild(position);
157                 writeChild(child); 
158                 position++;
159                 if (child instanceof ProcessingInstruction) breakLine();
160                 else if (child instanceof Comment && withComments) {
161                     breakLine();
162                 }
163                 else if (child instanceof Element) break;
164             }       
165             
166             for (int i = position; i < doc.getChildCount(); i++) {
167                 Node child = doc.getChild(i);
168                 if (child instanceof ProcessingInstruction) breakLine();
169                 else if (child instanceof Comment && withComments) {
170                     breakLine();
171                 }
172                 writeChild(child);
173             }
174             
175             flush();
176         }  
177      
178          
179         /**
180          * <p>
181          * Serializes an element onto the output stream using the canonical
182          * XML algorithm.  The result is guaranteed to be well-formed. 
183          * If <code>element</code> does not have a parent element, it will
184          * also be namespace well-formed.
185          * </p>
186          * 
187          * @param element the <code>Element</code> to serialize
188          * 
189          * @throws IOException if the underlying <code>OutputStream</code>
190          *     encounters an I/O error
191          */
192         protected final void write(Element element) 
193           throws IOException   {
194 
195             // treat empty elements differently to avoid an
196             // instance of test
197             if (element.getChildCount() == 0) {
198                 writeStartTag(element, false);
199                 writeEndTag(element);                
200             }
201             else {
202                 Node current = element;
203                 boolean end = false;
204                 int index = -1;
205                 int[] indexes = new int[10];
206                 int top = 0;
207                 indexes[0] = -1;
208                 while (true) {                   
209                     if (!end && current.getChildCount() > 0) {
210                        writeStartTag((Element) current, false);
211                        current = current.getChild(0);
212                        index = 0;
213                        top++;
214                        indexes = grow(indexes, top);
215                        indexes[top] = 0;
216                     }
217                     else {
218                         if (end) {
219                             writeEndTag((Element) current);
220                             if (current == element) break;
221                         }
222                         else {
223                             writeChild(current);
224                         }
225                         end = false;
226                         ParentNode parent = current.getParent();
227                         if (parent.getChildCount() - 1 == index) {
228                             current = parent;
229                             top--;
230                             if (current != element) {
231                                 parent = current.getParent();
232                                 index = indexes[top];
233                             }
234                             end = true;
235                         }
236                         else {
237                             index++;
238                             indexes[top] = index;
239                             current = parent.getChild(index);
240                         }
241                     }
242                 }   
243             }
244             
245         } 
246     
247         
248         private int[] grow(int[] indexes, int top) {
249             
250             if (top < indexes.length) return indexes;
251             int[] result = new int[indexes.length*2];
252             System.arraycopy(indexes, 0, result, 0, indexes.length);
253             return result;
254             
255         }
256 
257 
258         protected void writeStartTag(Element element, boolean isEmpty) 
259           throws IOException   {
260             writeRaw("<");
261             writeRaw(element.getQualifiedName());
262             
263             ParentNode parent = element.getParent();
264             
265             Element parentElement = null;
266             if (parent instanceof Element) {
267                 parentElement = (Element) parent; 
268             } 
269             
270             for (int i = 0; 
271                  i < element.getNamespaceDeclarationCount(); 
272                  i++) {
273                 String   prefix = element.getNamespacePrefix(i);
274                 String   uri = element.getNamespaceURI(prefix);
275                 if (parentElement != null) {
276                    if (uri.equals(
277                      parentElement.getNamespaceURI(prefix))) {
278                        continue; 
279                    }
280                 }
281                 else if (uri.equals("")) {
282                     continue; // no need to say xmlns=""   
283                 }
284                 
285                 writeRaw(" ");
286                 writeNamespaceDeclaration(prefix, uri);
287             } 
288             
289             Attribute[] sorted = sortAttributes(element);        
290             for (int i = 0; i < sorted.length; i++) {
291                 writeRaw(" ");
292                 write(sorted[i]);
293             }       
294             
295             writeRaw(">");
296         } 
297     
298         
299         protected void write(Attribute attribute) throws IOException   {
300             writeRaw(attribute.getQualifiedName());
301             writeRaw("=\"");
302             writeRaw(prepareAttributeValue(attribute));
303             writeRaw("\"");
304         }
305         
306         
307         protected void writeEndTag(Element element) throws IOException   {
308             writeRaw("</");
309             writeRaw(element.getQualifiedName());
310             writeRaw(">");
311         }    
312         
313         
314         private Attribute[] sortAttributes(Element element) {
315     
316             Attribute[] result 
317               = new Attribute[element.getAttributeCount()];
318             for (int i = 0; i < element.getAttributeCount(); i++) {
319                 result[i] = element.getAttribute(i); 
320             }
321             Arrays.sort(result, comparator);       
322             
323             return result;        
324             
325         }
326     
327         
328         private String   prepareAttributeValue(Attribute attribute) {
329     
330             String   value = attribute.getValue();
331             StringBuffer   result = new StringBuffer  (value.length());
332     
333             if (attribute.getType().equals(Attribute.Type.CDATA)
334               || attribute.getType().equals(Attribute.Type.UNDECLARED)) {
335                 char[] data = value.toCharArray();
336                 for (int i = 0; i < data.length; i++) {
337                     char c = data[i];
338                     if (c == '\t') {
339                         result.append("&#x9;");
340                     }
341                     else if (c == '\n') {
342                         result.append("&#xA;");
343                     }
344                     else if (c == '\r') {
345                         result.append("&#xD;");
346                     }
347                     else if (c == '\"') {
348                         result.append("&quot;");
349                     }
350                     else if (c == '&') {
351                         result.append("&amp;");
352                     }
353                     else if (c == '<') {
354                         result.append("&lt;");
355                     }
356                     else { 
357                         result.append(c);   
358                     }
359                 }
360             }
361             else {
362                 // According to the spec, "Whitespace character references
363                 // other than &#x20; are not affected by attribute value 
364                 // normalization. For parsed documents, the parser will  
365                 // still replace these with the actual character. I am 
366                 // going to assume that if one is found here, that the 
367                 // user meant to put it there; and so we will escape it 
368                 // with a character reference
369                 char[] data = value.toCharArray();
370                 boolean seenFirstNonSpace = false;
371                 for (int i = 0; i < data.length; i++) {
372                     if (data[i] == ' ') {
373                         if (i != data.length-1 && data[i+1] != ' ' && seenFirstNonSpace) {
374                              result.append(data[i]); 
375                         }
376                         continue;
377                     } 
378                     seenFirstNonSpace = true;
379                     if (data[i] == '\t') {
380                         result.append("&#x9;");
381                     }
382                     else if (data[i] == '\n') {
383                         result.append("&#xA;");
384                     }
385                     else if (data[i] == '\r') {
386                         result.append("&#xD;");
387                     }
388                     else if (data[i] == '\"') {
389                         result.append("&quot;");
390                     }
391                     else if (data[i] == '&') {
392                         result.append("&amp;");
393                     }
394                     else if (data[i] == '<') {
395                         result.append("&lt;");
396                     }
397                     else {
398                         result.append(data[i]);
399                     }
400                 }
401             }
402     
403             return result.toString();    
404             
405         }
406         
407         
408         /**
409          * <p>
410          * Serializes a <code>Text</code> object
411          * onto the output stream using the UTF-8 encoding.
412          * The reserved characters &lt;, &gt;, and &amp;
413          * are escaped using the standard entity references such as
414          * <code>&amp;lt;</code>, <code>&amp;gt;</code>, 
415          * and <code>&amp;amp;</code>.
416          * </p>
417          * 
418          * @param text the <code>Text</code> to serialize
419          * 
420          * @throws IOException  if the underlying <code>OutputStream</code>
421          *     encounters an I/O error
422          */
423         protected final void write(Text text) throws IOException   {
424             String   input = text.getValue();
425             StringBuffer   result = new StringBuffer  (input.length());
426             for (int i = 0; i < input.length(); i++) {
427                 char c = input.charAt(i);
428                 if (c == '\r') {
429                     result.append("&#xD;");
430                 }
431                 else if (c == '&') {
432                     result.append("&amp;");
433                 }
434                 else if (c == '<') {
435                     result.append("&lt;");
436                 }
437                 else if (c == '>') {
438                     result.append("&gt;");
439                 }
440                 else { 
441                     result.append(c);   
442                 }            
443             }
444             writeRaw(result.toString());
445         }   
446     
447         
448         /**
449          * <p>
450          * Serializes a <code>Comment</code> object
451          * onto the output stream if and only if this
452          * serializer is configured to produce canonical XML
453          * with comments.
454          * </p>
455          * 
456          * @param comment the <code>Comment</code> to serialize
457          * 
458          * @throws IOException if the underlying <code>OutputStream</code>
459          *     encounters an I/O error
460          */
461         protected final void write(Comment comment) throws IOException   {
462             if (withComments) super.write(comment);
463         }
464         
465         
466         /**
467          * <p>
468          * Does nothing because canonical XML does not include
469          * document type declarations.
470          * </p>
471          * 
472          * @param doctype the document type declaration to serialize
473          */
474         protected final void write(DocType doctype) {
475             // DocType is not serialized in canonical XML
476         } 
477        
478         
479     }
480 
481     
482     /**
483      * <p>
484      * Serializes a document onto the output 
485      * stream using the canonical XML algorithm.
486      * </p>
487      * 
488      * @param doc the document to serialize
489      * 
490      * @throws IOException if the underlying <code>OutputStream</code>
491      *      encounters an I/O error
492      */
493     public final void write(Document doc) throws IOException   {  
494         serializer.write(doc);        
495         serializer.flush();
496     }  
497  
498     
499 }
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags