XmlReader


// This file is copied from the Rome project (https://rome.dev.java.net/),
// version 0.5, which is licensed
// under the Apache V2 license (and doesn't include a NOTICE file)
4   /*
5    * Copyright 2004 Sun Microsystems, Inc.
6    *
7    * Licensed under the Apache License, Version 2.0 (the "License");
8    * you may not use this file except in compliance with the License.
9    * You may obtain a copy of the License at
10   *
11   *     http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the License is distributed on an "AS IS" BASIS,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the License for the specific language governing permissions and
17   * limitations under the License.
18   *
19   */
20  package org.outerj.daisy.xmlutil;
21  
import java.io.*;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;
import java.text.MessageFormat;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
29  
/**
 * Character stream that handles (or at least attempts to handle) all the necessary voodoo to
 * figure out the charset encoding of the XML document within the stream.
 * <p>
 * IMPORTANT: This class is not related in any way to the org.xml.sax.XMLReader. This one IS a
 * character stream.
 * <p>
 * All this has to be done without consuming characters from the stream; otherwise the XML parser
 * will not recognize the document as valid XML. This is not 100% true, but it's close enough
 * (the UTF-8 BOM is not handled by all parsers right now; XmlReader handles it and things work
 * in all parsers).
 * <p>
 * The XmlReader class handles the charset encoding of XML documents in Files, raw streams and
 * HTTP streams by offering a wide set of constructors.
 * <p>
 * By default the charset encoding detection is lenient; the constructors with the lenient flag
 * can be used for strict detection (following the HTTP MIME and XML specifications).
 * All this is nicely explained by Mark Pilgrim in his blog,
 * <a HREF="http://diveintomark.org/archives/2004/02/13/xml-media-types">
 * Determining the character encoding of a feed</a>.
 * <p>
 * @author Alejandro Abdelnur
 *
 */
54  public class XmlReader extends Reader {
55      private static final int PUSHBACK_MAX_SIZE = 1024;
56  
57      private static final String   UTF_8 = "UTF-8";
58      private static final String   US_ASCII = "US-ASCII";
59      private static final String   UTF_16BE = "UTF-16BE";
60      private static final String   UTF_16LE = "UTF-16LE";
61      private static final String   UTF_16 = "UTF-16";
62  
63      private Reader _reader;
64      private String   _encoding;
65  
66      /**
67       * Creates a Reader for a File.
68       * <p>
69       * It looks for the UTF-8 BOM first, if none sniffs the XML prolog charset, if this is also
70       * missing defaults to UTF-8.
71       * <p>
72       * It does a lenient charset encoding detection, check the constructor with the lenient parameter
73       * for details.
74       * <p>
75       * @param file File to create a Reader from.
76       * @throws IOException thrown if there is a problem reading the file.
77       *
78       */
79      public XmlReader(File file) throws IOException {
80          this(new FileInputStream(file));
81      }
82  
83      /**
84       * Creates a Reader for a raw InputStream.
85       * <p>
86       * It follows the same logic used for files.
87       * <p>
88       * It does a lenient charset encoding detection, check the constructor with the lenient parameter
89       * for details.
90       * <p>
91       * @param is InputStream to create a Reader from.
92       * @throws IOException thrown if there is a problem reading the stream.
93       *
94       */
95      public XmlReader(InputStream is) throws IOException {
96          this(is,true);
97      }
98  
99      /**
100      * Creates a Reader for a raw InputStream.
101      * <p>
102      * It follows the same logic used for files.
103      * <p>
104      * If lenient detection is indicated and the detection above fails as per specifications it then attempts
105      * the following:
106      * <p>
107      * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again.
108      * <p>
109      * Else if the XML prolog had a charset encoding that encoding is used.
110      * <p>
111      * Else if the content type had a charset encoding that encoding is used.
112      * <p>
113      * Else 'UTF-8' is used.
114      * <p>
115      * If lenient detection is indicated an XmlReaderException is never thrown.
116      * <p>
117      * @param is InputStream to create a Reader from.
118      * @param lenient indicates if the charset encoding detection should be relaxed.
119      * @throws IOException thrown if there is a problem reading the stream.
120      * @throws XmlReaderException thrown if the charset encoding could not be determined according to the specs.
121      *
122      */
123     public XmlReader(InputStream is,boolean lenient) throws IOException, XmlReaderException {
124         try {
125             doRawStream(is);
126         }
127         catch (XmlReaderException ex) {
128             if (!lenient) {
129                 throw ex;
130             }
131             else {
132                 doLenientDetection(null,ex);
133             }
134         }
135     }
136 
137     /**
138      * Creates a Reader using the InputStream of a URL.
139      * <p>
140      * If the URL is not of type HTTP and there is not 'content-type' header in the fetched
141      * data it uses the same logic used for Files.
142      * <p>
143      * If the URL is a HTTP Url or there is a 'content-type' header in the fetched
144      * data it uses the same logic used for an InputStream with content-type.
145      * <p>
146      * It does a lenient charset encoding detection, check the constructor with the lenient parameter
147      * for details.
148      * <p>
149      * @param url URL to create a Reader from.
150      * @throws IOException thrown if there is a problem reading the stream of the URL.
151      *
152      */
153     public XmlReader(URL   url) throws IOException {
154         this(url.openConnection());
155     }
156 
157     /**
158      * Creates a Reader using the InputStream of a URLConnection.
159      * <p>
160      * If the URLConnection is not of type HttpURLConnection and there is not
161      * 'content-type' header in the fetched data it uses the same logic used for files.
162      * <p>
163      * If the URLConnection is a HTTP Url or there is a 'content-type' header in the fetched
164      * data it uses the same logic used for an InputStream with content-type.
165      * <p>
166      * It does a lenient charset encoding detection, check the constructor with the lenient parameter
167      * for details.
168      * <p>
169      * @param conn URLConnection to create a Reader from.
170      * @throws IOException thrown if there is a problem reading the stream of the URLConnection.
171      *
172      */
173     public XmlReader(URLConnection   conn) throws IOException {
174         if (conn instanceof HttpURLConnection  ) {
175             try {
176                 doHttpStream(conn.getInputStream(),conn.getContentType());
177             }
178             catch (XmlReaderException ex) {
179                 doLenientDetection(conn.getContentType(),ex);
180             }
181         }
182         else
183         if (conn.getContentType()!=null) {
184             try {
185                 doHttpStream(conn.getInputStream(),conn.getContentType());
186             }
187             catch (XmlReaderException ex) {
188                 doLenientDetection(conn.getContentType(),ex);
189             }
190         }
191         else {
192             try {
193                 doRawStream(conn.getInputStream());
194             }
195             catch (XmlReaderException ex) {
196                 doLenientDetection(null,ex);
197             }
198         }
199     }
200 
201     /**
202      * Creates a Reader using an InputStream an the associated content-type header.
203      * <p>
204      * First it checks if the stream has BOM. If there is not BOM checks the content-type encoding.
205      * If there is not content-type encoding checks the XML prolog encoding. If there is not XML
206      * prolog encoding uses the default encoding mandated by the content-type MIME type.
207      * <p>
208      * It does a lenient charset encoding detection, check the constructor with the lenient parameter
209      * for details.
210      * <p>
211      * @param is InputStream to create the reader from.
212      * @param httpContentType content-type header to use for the resolution of the charset encoding.
213      * @throws IOException thrown if there is a problem reading the file.
214      *
215      */
216     public XmlReader(InputStream is,String   httpContentType) throws IOException {
217         this(is,httpContentType,true);
218     }
219 
220     /**
221      * Creates a Reader using an InputStream an the associated content-type header. This constructor is
222      * lenient regarding the encoding detection.
223      * <p>
224      * First it checks if the stream has BOM. If there is not BOM checks the content-type encoding.
225      * If there is not content-type encoding checks the XML prolog encoding. If there is not XML
226      * prolog encoding uses the default encoding mandated by the content-type MIME type.
227      * <p>
228      * If lenient detection is indicated and the detection above fails as per specifications it then attempts
229      * the following:
230      * <p>
231      * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again.
232      * <p>
233      * Else if the XML prolog had a charset encoding that encoding is used.
234      * <p>
235      * Else if the content type had a charset encoding that encoding is used.
236      * <p>
237      * Else 'UTF-8' is used.
238      * <p>
239      * If lenient detection is indicated an XmlReaderException is never thrown.
240      * <p>
241      * @param is InputStream to create the reader from.
242      * @param httpContentType content-type header to use for the resolution of the charset encoding.
243      * @param lenient indicates if the charset encoding detection should be relaxed.
244      * @throws IOException thrown if there is a problem reading the file.
245      * @throws XmlReaderException thrown if the charset encoding could not be determined according to the specs.
246      *
247      */
248     public XmlReader(InputStream is,String   httpContentType,boolean lenient) throws IOException, XmlReaderException {
249         try {
250             doHttpStream(is,httpContentType);
251         }
252         catch (XmlReaderException ex) {
253             if (!lenient) {
254                 throw ex;
255             }
256             else {
257                 doLenientDetection(httpContentType,ex);
258             }
259         }
260     }
261 
262     private void doLenientDetection(String   httpContentType,XmlReaderException ex) throws IOException {
263         if (httpContentType!=null) {
264             if (httpContentType.startsWith("text/html")) {
265                 httpContentType = httpContentType.substring("text/html".length());
266                 httpContentType = "text/xml" + httpContentType;
267                 try {
268                     doHttpStream(ex.getInputStream(),httpContentType);
269                     ex = null;
270                 }
271                 catch (XmlReaderException ex2) {
272                     ex = ex2;
273                 }
274             }
275         }
276         if (ex!=null) {
277             String   encoding = ex.getXmlEncoding();
278             if (encoding==null) {
279                 encoding = ex.getContentTypeEncoding();
280             }
281             if (encoding==null) {
282                 encoding = UTF_8;
283             }
284             prepareReader(ex.getInputStream(),encoding);
285         }
286     }
287 
288     /**
289      * Returns the charset encoding of the XmlReader.
290      * <p>
291      * @return charset encoding.
292      *
293      */
294     public String   getEncoding() {
295         return _encoding;
296     }
297 
298     public int read(char[] buf,int offset,int len) throws IOException {
299         return _reader.read(buf,offset,len);
300     }
301 
302     /**
303      * Closes the XmlReader stream.
304      * <p>
305      * @throws IOException thrown if there was a problem closing the stream.
306      *
307      */
308     public void close() throws IOException {
309         _reader.close();
310     }
311 
312     private void doRawStream(InputStream is) throws IOException {
313         PushbackInputStream pis = new PushbackInputStream(is,PUSHBACK_MAX_SIZE);
314         String   bomEnc = getBOMEncoding(pis);
315         String   xmlGuessEnc =  getXMLGuessEncoding(pis);
316         String   xmlEnc = getXMLPrologEncoding(pis,xmlGuessEnc);
317         String   encoding = calculateRawEncoding(bomEnc, xmlGuessEnc, xmlEnc, pis);
318         prepareReader(pis,encoding);
319     }
320 
321     private void doHttpStream(InputStream is,String   httpContentType) throws IOException {
322         PushbackInputStream pis = new PushbackInputStream(is,PUSHBACK_MAX_SIZE);
323         String   cTMime = getContentTypeMime(httpContentType);
324         String   cTEnc  = getContentTypeEncoding(httpContentType);
325         String   bomEnc = getBOMEncoding(pis);
326         String   xmlGuessEnc =  getXMLGuessEncoding(pis);
327         String   xmlEnc = getXMLPrologEncoding(pis,xmlGuessEnc);
328         String   encoding = calculateHttpEncoding(cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc, pis);
329         prepareReader(pis,encoding);
330     }
331 
332     private void prepareReader(InputStream is,String   encoding) throws IOException {
333         _reader = new InputStreamReader(is,encoding);
334         _encoding = encoding;
335     }
336 
337     // InputStream is passed for XmlReaderException creation only
338     private static String   calculateRawEncoding(String   bomEnc, String   xmlGuessEnc, String   xmlEnc, InputStream is) throws IOException {
339         String   encoding;
340         if (bomEnc==null) {
341             if (xmlGuessEnc==null || xmlEnc==null) {
342                 encoding = UTF_8;
343             }
344             else
345             if (xmlEnc.equals(UTF_16) && (xmlGuessEnc.equals(UTF_16BE) || xmlGuessEnc.equals(UTF_16LE))) {
346                 encoding = xmlGuessEnc;
347             }
348             else {
349                 encoding = xmlEnc;
350             }
351         }
352         else
353         if (bomEnc.equals(UTF_8)) {
354             if (xmlGuessEnc!=null && !xmlGuessEnc.equals(UTF_8)) {
355                 throw new XmlReaderException(RAW_EX_1.format(new Object  []{bomEnc,xmlGuessEnc,xmlEnc}),
356                                              bomEnc,xmlGuessEnc,xmlEnc,is);
357             }
358             if (xmlEnc!=null && !xmlEnc.equals(UTF_8)) {
359                 throw new XmlReaderException(RAW_EX_1.format(new Object  []{bomEnc,xmlGuessEnc,xmlEnc}),
360                                              bomEnc,xmlGuessEnc,xmlEnc,is);
361             }
362             encoding = UTF_8;
363         }
364         else
365         if (bomEnc.equals(UTF_16BE) || bomEnc.equals(UTF_16LE)) {
366             if (xmlGuessEnc!=null && !xmlGuessEnc.equals(bomEnc)) {
367                 throw new IOException(RAW_EX_1.format(new Object  []{bomEnc,xmlGuessEnc,xmlEnc}));
368             }
369             if (xmlEnc!=null && !xmlEnc.equals(UTF_16) && !xmlEnc.equals(bomEnc)) {
370                 throw new XmlReaderException(RAW_EX_1.format(new Object  []{bomEnc,xmlGuessEnc,xmlEnc}),
371                                              bomEnc,xmlGuessEnc,xmlEnc,is);
372             }
373             encoding =bomEnc;
374         }
375         else {
376             throw new XmlReaderException(RAW_EX_2.format(new Object  []{bomEnc,xmlGuessEnc,xmlEnc}),
377                                          bomEnc,xmlGuessEnc,xmlEnc,is);
378         }
379         return encoding;
380     }
381 
382     // InputStream is passed for XmlReaderException creation only
383     private static String   calculateHttpEncoding(String   cTMime, String   cTEnc, String   bomEnc, String   xmlGuessEnc, String   xmlEnc, InputStream is) throws IOException {
384         boolean appXml = isAppXml(cTMime);
385         boolean textXml = isTextXml(cTMime);
386         String   encoding;
387         if (appXml || textXml) {
388             if (cTEnc==null) {
389                 if (appXml) {
390                     encoding = calculateRawEncoding(bomEnc, xmlGuessEnc, xmlEnc, is);
391                 }
392                 else {
393                     encoding = US_ASCII;
394                 }
395             }
396             else
397             if (bomEnc!=null && (cTEnc.equals(UTF_16BE) || cTEnc.equals(UTF_16LE))) {
398                 throw new XmlReaderException(HTTP_EX_1.format(new Object  []{cTMime,cTEnc,bomEnc,xmlGuessEnc,xmlEnc}),
399                                              cTMime,cTEnc,bomEnc,xmlGuessEnc,xmlEnc,is);
400             }
401             else
402             if (cTEnc.equals(UTF_16)) {
403                 if (bomEnc!=null && bomEnc.startsWith(UTF_16)) {
404                     encoding = bomEnc;
405                 }
406                 else {
407                     throw new XmlReaderException(HTTP_EX_2.format(new Object  []{cTMime,cTEnc,bomEnc,xmlGuessEnc,xmlEnc}),
408                                                  cTMime,cTEnc,bomEnc,xmlGuessEnc,xmlEnc,is);
409                 }
410             }
411             else {
412                 encoding = cTEnc;
413             }
414         }
415         else {
416             throw new XmlReaderException(HTTP_EX_3.format(new Object  []{cTMime,cTEnc,bomEnc,xmlGuessEnc,xmlEnc}),
417                                          cTMime,cTEnc,bomEnc,xmlGuessEnc,xmlEnc,is);
418         }
419         return encoding;
420     }
421 
422     // returns MIME type or NULL if httpContentType is NULL
423     private static String   getContentTypeMime(String   httpContentType) {
424         String   mime = null;
425         if (httpContentType!=null) {
426             int i = httpContentType.indexOf(";");
427             mime = ((i==-1) ? httpContentType : httpContentType.substring(0,i)).trim();
428         }
429         return mime;
430     }
431 
432     private static final Pattern   CHARSET_PATTERN = Pattern.compile("charset=([.[^; ]]*)");
433 
434     // returns charset parameter value, NULL if not present, NULL if httpContentType is NULL
435     private static String   getContentTypeEncoding(String   httpContentType) {
436         String   encoding = null;
437         if (httpContentType!=null) {
438             int i = httpContentType.indexOf(";");
439             if (i>-1) {
440                 String   postMime = httpContentType.substring(i+1);
441                 Matcher   m = CHARSET_PATTERN.matcher(postMime);
442                 encoding = (m.find()) ? m.group(1) : null;
443                 encoding = (encoding!=null) ? encoding.toUpperCase() : null;
444             }
445         }
446         return encoding;
447     }
448 
449     // returns the BOM in the stream, NULL if not present,
450     // if there was BOM the in the stream it is consumed
451     private static String   getBOMEncoding(PushbackInputStream is) throws IOException {
452         String   encoding = null;
453         int[] bytes = new int[3];
454         bytes[0] = is.read();
455         bytes[1] = is.read();
456         bytes[2] = is.read();
457 
458         if (bytes[0] == 0xFE && bytes[1] == 0xFF) {
459             encoding = UTF_16BE;
460             is.unread(bytes[2]);
461         }
462         else
463         if (bytes[0] == 0xFF && bytes[1] == 0xFE) {
464             encoding = UTF_16LE;
465             is.unread(bytes[2]);
466         }
467         else
468         if (bytes[0] == 0xEF && bytes[1] == 0xBB && bytes[2] == 0xBF) {
469             encoding = UTF_8;
470         }
471         else {
472             for (int i=bytes.length-1;i>=0;i--) {
473                 is.unread(bytes[i]);
474             }
475         }
476         return encoding;
477     }
478 
479     // returns the best guess for the encoding by looking the first bytes of the stream, '<?'
480     private static String   getXMLGuessEncoding(PushbackInputStream is) throws IOException {
481         String   encoding = null;
482         int[] bytes = new int[4];
483         bytes[0] = is.read();
484         bytes[1] = is.read();
485         bytes[2] = is.read();
486         bytes[3] = is.read();
487         for (int i=bytes.length-1;i>=0;i--) {
488             is.unread(bytes[i]);
489         }
490 
491         if (bytes[0] == 0x00 && bytes[1] == 0x3C && bytes[2] == 0x00 && bytes[3] == 0x3F) {
492                 encoding = UTF_16BE;
493         }
494         else
495         if (bytes[0] == 0x3C && bytes[1] == 0x00 && bytes[2] == 0x3F && bytes[3] == 0x00) {
496                 encoding = UTF_16LE;
497         }
498         else
499         if (bytes[0] == 0x3C && bytes[1] == 0x3F && bytes[2] == 0x78 && bytes[3] == 0x6D) {
500             encoding = UTF_8;
501         }
502         return encoding;
503     }
504 
505     private static final Pattern   ENCODING_PATTERN = Pattern.compile("^<\\?xml.*encoding=\"(.*)\".*\\?>");
506 
507     // returns the encoding declared in the <?xml encoding=...?>,  NULL if none
508     private static String   getXMLPrologEncoding(PushbackInputStream is,String   guessedEnc) throws IOException {
509         String   encoding = null;
510         if (guessedEnc!=null) {
511             byte[] bytes = new byte[PUSHBACK_MAX_SIZE];
512             int offset = 0;
513             int max = PUSHBACK_MAX_SIZE;
514             int c = is.read(bytes,offset,max);
515             while (c!=-1 && offset<PUSHBACK_MAX_SIZE) {
516                 offset += c;
517                 max -= c;
518                 c = is.read(bytes,offset,max);
519             }
520             int bytesRead = offset;
521             if (bytesRead>0) {
522                 is.unread(bytes,0,bytesRead);
523                 Reader reader = new InputStreamReader(new ByteArrayInputStream(bytes,0,bytesRead), guessedEnc);
524                 BufferedReader br = new BufferedReader(reader);
525                 String   prolog = br.readLine();
526                 Matcher   m = ENCODING_PATTERN.matcher(prolog);
527                 encoding = (m.find()) ? m.group(1).toUpperCase() : null;
528             }
529         }
530         return encoding;
531     }
532 
533     // indicates if the MIME type belongs to the APPLICATION XML family
534     private static boolean isAppXml(String   mime) {
535         return mime!=null &&
536                (mime.equals("application/xml") ||
537                 mime.equals("application/xml-dtd") ||
538                 mime.equals("application/xml-external-parsed-entity") ||
539                 (mime.startsWith("application/") && mime.endsWith("+xml")));
540     }
541 
542     // indicates if the MIME type belongs to the TEXT XML family
543     private static boolean isTextXml(String   mime) {
544         return mime!=null &&
545                (mime.equals("text/xml") ||
546                 mime.equals("text/xml-external-parsed-entity") ||
547                 (mime.startsWith("text/") && mime.endsWith("+xml")));
548     }
549 
550     private static final MessageFormat   RAW_EX_1 = new MessageFormat  (
551             "Invalid encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] encoding mismatch");
552 
553     private static final MessageFormat   RAW_EX_2 = new MessageFormat  (
554             "Invalid encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] unknown BOM");
555 
556     private static final MessageFormat   HTTP_EX_1 = new MessageFormat  (
557             "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], BOM must be NULL");
558 
559     private static final MessageFormat   HTTP_EX_2 = new MessageFormat  (
560             "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], encoding mismatch");
561 
562     private static final MessageFormat   HTTP_EX_3 = new MessageFormat  (
563             "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], Invalid MIME");
564 
565 }
566
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags