KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > outerj > daisy > xmlutil > XmlReader


// This file is copied from the Rome project (https://rome.dev.java.net/),
// version 0.5, which is licensed
// under the Apache V2 license (and doesn't include a NOTICE file).
4
/*
5  * Copyright 2004 Sun Microsystems, Inc.
6  *
7  * Licensed under the Apache License, Version 2.0 (the "License");
8  * you may not use this file except in compliance with the License.
9  * You may obtain a copy of the License at
10  *
11  * http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing, software
14  * distributed under the License is distributed on an "AS IS" BASIS,
15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  * See the License for the specific language governing permissions and
17  * limitations under the License.
18  *
19  */

20 package org.outerj.daisy.xmlutil;
21
import java.io.*;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;
import java.text.MessageFormat;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
29
30 /**
31  * Character stream that handles (or at least attemtps to) all the necessary Voodo to figure out
32  * the charset encoding of the XML document within the stream.
33  * <p>
34  * IMPORTANT: This class is not related in any way to the org.xml.sax.XMLReader. This one IS a
35  * character stream.
36  * <p>
37  * All this has to be done without consuming characters from the stream, if not the XML parser
38  * will not recognized the document as a valid XML. This is not 100% true, but it's close enough
39  * (UTF-8 BOM is not handled by all parsers right now, XmlReader handles it and things work in all
40  * parsers).
41  * <p>
42  * The XmlReader class handles the charset encoding of XML documents in Files, raw streams and
43  * HTTP streams by offering a wide set of constructors.
44  * <P>
45  * By default the charset encoding detection is lenient, the constructor with the lenient flag
46  * can be used for an script (following HTTP MIME and XML specifications).
47  * All this is nicely explained by Mark Pilgrim in his blog,
48  * <a HREF="http://diveintomark.org/archives/2004/02/13/xml-media-types">
49  * Determining the character encoding of a feed</a>.
50  * <p>
51  * @author Alejandro Abdelnur
52  *
53  */

54 public class XmlReader extends Reader {
55     private static final int PUSHBACK_MAX_SIZE = 1024;
56
57     private static final String JavaDoc UTF_8 = "UTF-8";
58     private static final String JavaDoc US_ASCII = "US-ASCII";
59     private static final String JavaDoc UTF_16BE = "UTF-16BE";
60     private static final String JavaDoc UTF_16LE = "UTF-16LE";
61     private static final String JavaDoc UTF_16 = "UTF-16";
62
63     private Reader _reader;
64     private String JavaDoc _encoding;
65
66     /**
67      * Creates a Reader for a File.
68      * <p>
69      * It looks for the UTF-8 BOM first, if none sniffs the XML prolog charset, if this is also
70      * missing defaults to UTF-8.
71      * <p>
72      * It does a lenient charset encoding detection, check the constructor with the lenient parameter
73      * for details.
74      * <p>
75      * @param file File to create a Reader from.
76      * @throws IOException thrown if there is a problem reading the file.
77      *
78      */

79     public XmlReader(File file) throws IOException {
80         this(new FileInputStream(file));
81     }
82
83     /**
84      * Creates a Reader for a raw InputStream.
85      * <p>
86      * It follows the same logic used for files.
87      * <p>
88      * It does a lenient charset encoding detection, check the constructor with the lenient parameter
89      * for details.
90      * <p>
91      * @param is InputStream to create a Reader from.
92      * @throws IOException thrown if there is a problem reading the stream.
93      *
94      */

95     public XmlReader(InputStream is) throws IOException {
96         this(is,true);
97     }
98
99     /**
100      * Creates a Reader for a raw InputStream.
101      * <p>
102      * It follows the same logic used for files.
103      * <p>
104      * If lenient detection is indicated and the detection above fails as per specifications it then attempts
105      * the following:
106      * <p>
107      * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again.
108      * <p>
109      * Else if the XML prolog had a charset encoding that encoding is used.
110      * <p>
111      * Else if the content type had a charset encoding that encoding is used.
112      * <p>
113      * Else 'UTF-8' is used.
114      * <p>
115      * If lenient detection is indicated an XmlReaderException is never thrown.
116      * <p>
117      * @param is InputStream to create a Reader from.
118      * @param lenient indicates if the charset encoding detection should be relaxed.
119      * @throws IOException thrown if there is a problem reading the stream.
120      * @throws XmlReaderException thrown if the charset encoding could not be determined according to the specs.
121      *
122      */

123     public XmlReader(InputStream is,boolean lenient) throws IOException, XmlReaderException {
124         try {
125             doRawStream(is);
126         }
127         catch (XmlReaderException ex) {
128             if (!lenient) {
129                 throw ex;
130             }
131             else {
132                 doLenientDetection(null,ex);
133             }
134         }
135     }
136
137     /**
138      * Creates a Reader using the InputStream of a URL.
139      * <p>
140      * If the URL is not of type HTTP and there is not 'content-type' header in the fetched
141      * data it uses the same logic used for Files.
142      * <p>
143      * If the URL is a HTTP Url or there is a 'content-type' header in the fetched
144      * data it uses the same logic used for an InputStream with content-type.
145      * <p>
146      * It does a lenient charset encoding detection, check the constructor with the lenient parameter
147      * for details.
148      * <p>
149      * @param url URL to create a Reader from.
150      * @throws IOException thrown if there is a problem reading the stream of the URL.
151      *
152      */

153     public XmlReader(URL JavaDoc url) throws IOException {
154         this(url.openConnection());
155     }
156
157     /**
158      * Creates a Reader using the InputStream of a URLConnection.
159      * <p>
160      * If the URLConnection is not of type HttpURLConnection and there is not
161      * 'content-type' header in the fetched data it uses the same logic used for files.
162      * <p>
163      * If the URLConnection is a HTTP Url or there is a 'content-type' header in the fetched
164      * data it uses the same logic used for an InputStream with content-type.
165      * <p>
166      * It does a lenient charset encoding detection, check the constructor with the lenient parameter
167      * for details.
168      * <p>
169      * @param conn URLConnection to create a Reader from.
170      * @throws IOException thrown if there is a problem reading the stream of the URLConnection.
171      *
172      */

173     public XmlReader(URLConnection JavaDoc conn) throws IOException {
174         if (conn instanceof HttpURLConnection JavaDoc) {
175             try {
176                 doHttpStream(conn.getInputStream(),conn.getContentType());
177             }
178             catch (XmlReaderException ex) {
179                 doLenientDetection(conn.getContentType(),ex);
180             }
181         }
182         else
183         if (conn.getContentType()!=null) {
184             try {
185                 doHttpStream(conn.getInputStream(),conn.getContentType());
186             }
187             catch (XmlReaderException ex) {
188                 doLenientDetection(conn.getContentType(),ex);
189             }
190         }
191         else {
192             try {
193                 doRawStream(conn.getInputStream());
194             }
195             catch (XmlReaderException ex) {
196                 doLenientDetection(null,ex);
197             }
198         }
199     }
200
201     /**
202      * Creates a Reader using an InputStream an the associated content-type header.
203      * <p>
204      * First it checks if the stream has BOM. If there is not BOM checks the content-type encoding.
205      * If there is not content-type encoding checks the XML prolog encoding. If there is not XML
206      * prolog encoding uses the default encoding mandated by the content-type MIME type.
207      * <p>
208      * It does a lenient charset encoding detection, check the constructor with the lenient parameter
209      * for details.
210      * <p>
211      * @param is InputStream to create the reader from.
212      * @param httpContentType content-type header to use for the resolution of the charset encoding.
213      * @throws IOException thrown if there is a problem reading the file.
214      *
215      */

216     public XmlReader(InputStream is,String JavaDoc httpContentType) throws IOException {
217         this(is,httpContentType,true);
218     }
219
220     /**
221      * Creates a Reader using an InputStream an the associated content-type header. This constructor is
222      * lenient regarding the encoding detection.
223      * <p>
224      * First it checks if the stream has BOM. If there is not BOM checks the content-type encoding.
225      * If there is not content-type encoding checks the XML prolog encoding. If there is not XML
226      * prolog encoding uses the default encoding mandated by the content-type MIME type.
227      * <p>
228      * If lenient detection is indicated and the detection above fails as per specifications it then attempts
229      * the following:
230      * <p>
231      * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again.
232      * <p>
233      * Else if the XML prolog had a charset encoding that encoding is used.
234      * <p>
235      * Else if the content type had a charset encoding that encoding is used.
236      * <p>
237      * Else 'UTF-8' is used.
238      * <p>
239      * If lenient detection is indicated an XmlReaderException is never thrown.
240      * <p>
241      * @param is InputStream to create the reader from.
242      * @param httpContentType content-type header to use for the resolution of the charset encoding.
243      * @param lenient indicates if the charset encoding detection should be relaxed.
244      * @throws IOException thrown if there is a problem reading the file.
245      * @throws XmlReaderException thrown if the charset encoding could not be determined according to the specs.
246      *
247      */

248     public XmlReader(InputStream is,String JavaDoc httpContentType,boolean lenient) throws IOException, XmlReaderException {
249         try {
250             doHttpStream(is,httpContentType);
251         }
252         catch (XmlReaderException ex) {
253             if (!lenient) {
254                 throw ex;
255             }
256             else {
257                 doLenientDetection(httpContentType,ex);
258             }
259         }
260     }
261
262     private void doLenientDetection(String JavaDoc httpContentType,XmlReaderException ex) throws IOException {
263         if (httpContentType!=null) {
264             if (httpContentType.startsWith("text/html")) {
265                 httpContentType = httpContentType.substring("text/html".length());
266                 httpContentType = "text/xml" + httpContentType;
267                 try {
268                     doHttpStream(ex.getInputStream(),httpContentType);
269                     ex = null;
270                 }
271                 catch (XmlReaderException ex2) {
272                     ex = ex2;
273                 }
274             }
275         }
276         if (ex!=null) {
277             String JavaDoc encoding = ex.getXmlEncoding();
278             if (encoding==null) {
279                 encoding = ex.getContentTypeEncoding();
280             }
281             if (encoding==null) {
282                 encoding = UTF_8;
283             }
284             prepareReader(ex.getInputStream(),encoding);
285         }
286     }
287
288     /**
289      * Returns the charset encoding of the XmlReader.
290      * <p>
291      * @return charset encoding.
292      *
293      */

294     public String JavaDoc getEncoding() {
295         return _encoding;
296     }
297
298     public int read(char[] buf,int offset,int len) throws IOException {
299         return _reader.read(buf,offset,len);
300     }
301
302     /**
303      * Closes the XmlReader stream.
304      * <p>
305      * @throws IOException thrown if there was a problem closing the stream.
306      *
307      */

308     public void close() throws IOException {
309         _reader.close();
310     }
311
312     private void doRawStream(InputStream is) throws IOException {
313         PushbackInputStream pis = new PushbackInputStream(is,PUSHBACK_MAX_SIZE);
314         String JavaDoc bomEnc = getBOMEncoding(pis);
315         String JavaDoc xmlGuessEnc = getXMLGuessEncoding(pis);
316         String JavaDoc xmlEnc = getXMLPrologEncoding(pis,xmlGuessEnc);
317         String JavaDoc encoding = calculateRawEncoding(bomEnc, xmlGuessEnc, xmlEnc, pis);
318         prepareReader(pis,encoding);
319     }
320
321     private void doHttpStream(InputStream is,String JavaDoc httpContentType) throws IOException {
322         PushbackInputStream pis = new PushbackInputStream(is,PUSHBACK_MAX_SIZE);
323         String JavaDoc cTMime = getContentTypeMime(httpContentType);
324         String JavaDoc cTEnc = getContentTypeEncoding(httpContentType);
325         String JavaDoc bomEnc = getBOMEncoding(pis);
326         String JavaDoc xmlGuessEnc = getXMLGuessEncoding(pis);
327         String JavaDoc xmlEnc = getXMLPrologEncoding(pis,xmlGuessEnc);
328         String JavaDoc encoding = calculateHttpEncoding(cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc, pis);
329         prepareReader(pis,encoding);
330     }
331
332     private void prepareReader(InputStream is,String JavaDoc encoding) throws IOException {
333         _reader = new InputStreamReader(is,encoding);
334         _encoding = encoding;
335     }
336
337     // InputStream is passed for XmlReaderException creation only
338
private static String JavaDoc calculateRawEncoding(String JavaDoc bomEnc, String JavaDoc xmlGuessEnc, String JavaDoc xmlEnc, InputStream is) throws IOException {
339         String JavaDoc encoding;
340         if (bomEnc==null) {
341             if (xmlGuessEnc==null || xmlEnc==null) {
342                 encoding = UTF_8;
343             }
344             else
345             if (xmlEnc.equals(UTF_16) && (xmlGuessEnc.equals(UTF_16BE) || xmlGuessEnc.equals(UTF_16LE))) {
346                 encoding = xmlGuessEnc;
347             }
348             else {
349                 encoding = xmlEnc;
350             }
351         }
352         else
353         if (bomEnc.equals(UTF_8)) {
354             if (xmlGuessEnc!=null && !xmlGuessEnc.equals(UTF_8)) {
355                 throw new XmlReaderException(RAW_EX_1.format(new Object JavaDoc[]{bomEnc,xmlGuessEnc,xmlEnc}),
356                                              bomEnc,xmlGuessEnc,xmlEnc,is);
357             }
358             if (xmlEnc!=null && !xmlEnc.equals(UTF_8)) {
359                 throw new XmlReaderException(RAW_EX_1.format(new Object JavaDoc[]{bomEnc,xmlGuessEnc,xmlEnc}),
360                                              bomEnc,xmlGuessEnc,xmlEnc,is);
361             }
362             encoding = UTF_8;
363         }
364         else
365         if (bomEnc.equals(UTF_16BE) || bomEnc.equals(UTF_16LE)) {
366             if (xmlGuessEnc!=null && !xmlGuessEnc.equals(bomEnc)) {
367                 throw new IOException(RAW_EX_1.format(new Object JavaDoc[]{bomEnc,xmlGuessEnc,xmlEnc}));
368             }
369             if (xmlEnc!=null && !xmlEnc.equals(UTF_16) && !xmlEnc.equals(bomEnc)) {
370                 throw new XmlReaderException(RAW_EX_1.format(new Object JavaDoc[]{bomEnc,xmlGuessEnc,xmlEnc}),
371                                              bomEnc,xmlGuessEnc,xmlEnc,is);
372             }
373             encoding =bomEnc;
374         }
375         else {
376             throw new XmlReaderException(RAW_EX_2.format(new Object JavaDoc[]{bomEnc,xmlGuessEnc,xmlEnc}),
377                                          bomEnc,xmlGuessEnc,xmlEnc,is);
378         }
379         return encoding;
380     }
381
382     // InputStream is passed for XmlReaderException creation only
383
private static String JavaDoc calculateHttpEncoding(String JavaDoc cTMime, String JavaDoc cTEnc, String JavaDoc bomEnc, String JavaDoc xmlGuessEnc, String JavaDoc xmlEnc, InputStream is) throws IOException {
384         boolean appXml = isAppXml(cTMime);
385         boolean textXml = isTextXml(cTMime);
386         String JavaDoc encoding;
387         if (appXml || textXml) {
388             if (cTEnc==null) {
389                 if (appXml) {
390                     encoding = calculateRawEncoding(bomEnc, xmlGuessEnc, xmlEnc, is);
391                 }
392                 else {
393                     encoding = US_ASCII;
394                 }
395             }
396             else
397             if (bomEnc!=null && (cTEnc.equals(UTF_16BE) || cTEnc.equals(UTF_16LE))) {
398                 throw new XmlReaderException(HTTP_EX_1.format(new Object JavaDoc[]{cTMime,cTEnc,bomEnc,xmlGuessEnc,xmlEnc}),
399                                              cTMime,cTEnc,bomEnc,xmlGuessEnc,xmlEnc,is);
400             }
401             else
402             if (cTEnc.equals(UTF_16)) {
403                 if (bomEnc!=null && bomEnc.startsWith(UTF_16)) {
404                     encoding = bomEnc;
405                 }
406                 else {
407                     throw new XmlReaderException(HTTP_EX_2.format(new Object JavaDoc[]{cTMime,cTEnc,bomEnc,xmlGuessEnc,xmlEnc}),
408                                                  cTMime,cTEnc,bomEnc,xmlGuessEnc,xmlEnc,is);
409                 }
410             }
411             else {
412                 encoding = cTEnc;
413             }
414         }
415         else {
416             throw new XmlReaderException(HTTP_EX_3.format(new Object JavaDoc[]{cTMime,cTEnc,bomEnc,xmlGuessEnc,xmlEnc}),
417                                          cTMime,cTEnc,bomEnc,xmlGuessEnc,xmlEnc,is);
418         }
419         return encoding;
420     }
421
422     // returns MIME type or NULL if httpContentType is NULL
423
private static String JavaDoc getContentTypeMime(String JavaDoc httpContentType) {
424         String JavaDoc mime = null;
425         if (httpContentType!=null) {
426             int i = httpContentType.indexOf(";");
427             mime = ((i==-1) ? httpContentType : httpContentType.substring(0,i)).trim();
428         }
429         return mime;
430     }
431
432     private static final Pattern JavaDoc CHARSET_PATTERN = Pattern.compile("charset=([.[^; ]]*)");
433
434     // returns charset parameter value, NULL if not present, NULL if httpContentType is NULL
435
private static String JavaDoc getContentTypeEncoding(String JavaDoc httpContentType) {
436         String JavaDoc encoding = null;
437         if (httpContentType!=null) {
438             int i = httpContentType.indexOf(";");
439             if (i>-1) {
440                 String JavaDoc postMime = httpContentType.substring(i+1);
441                 Matcher JavaDoc m = CHARSET_PATTERN.matcher(postMime);
442                 encoding = (m.find()) ? m.group(1) : null;
443                 encoding = (encoding!=null) ? encoding.toUpperCase() : null;
444             }
445         }
446         return encoding;
447     }
448
449     // returns the BOM in the stream, NULL if not present,
450
// if there was BOM the in the stream it is consumed
451
private static String JavaDoc getBOMEncoding(PushbackInputStream is) throws IOException {
452         String JavaDoc encoding = null;
453         int[] bytes = new int[3];
454         bytes[0] = is.read();
455         bytes[1] = is.read();
456         bytes[2] = is.read();
457
458         if (bytes[0] == 0xFE && bytes[1] == 0xFF) {
459             encoding = UTF_16BE;
460             is.unread(bytes[2]);
461         }
462         else
463         if (bytes[0] == 0xFF && bytes[1] == 0xFE) {
464             encoding = UTF_16LE;
465             is.unread(bytes[2]);
466         }
467         else
468         if (bytes[0] == 0xEF && bytes[1] == 0xBB && bytes[2] == 0xBF) {
469             encoding = UTF_8;
470         }
471         else {
472             for (int i=bytes.length-1;i>=0;i--) {
473                 is.unread(bytes[i]);
474             }
475         }
476         return encoding;
477     }
478
479     // returns the best guess for the encoding by looking the first bytes of the stream, '<?'
480
private static String JavaDoc getXMLGuessEncoding(PushbackInputStream is) throws IOException {
481         String JavaDoc encoding = null;
482         int[] bytes = new int[4];
483         bytes[0] = is.read();
484         bytes[1] = is.read();
485         bytes[2] = is.read();
486         bytes[3] = is.read();
487         for (int i=bytes.length-1;i>=0;i--) {
488             is.unread(bytes[i]);
489         }
490
491         if (bytes[0] == 0x00 && bytes[1] == 0x3C && bytes[2] == 0x00 && bytes[3] == 0x3F) {
492                 encoding = UTF_16BE;
493         }
494         else
495         if (bytes[0] == 0x3C && bytes[1] == 0x00 && bytes[2] == 0x3F && bytes[3] == 0x00) {
496                 encoding = UTF_16LE;
497         }
498         else
499         if (bytes[0] == 0x3C && bytes[1] == 0x3F && bytes[2] == 0x78 && bytes[3] == 0x6D) {
500             encoding = UTF_8;
501         }
502         return encoding;
503     }
504
505     private static final Pattern JavaDoc ENCODING_PATTERN = Pattern.compile("^<\\?xml.*encoding=\"(.*)\".*\\?>");
506
507     // returns the encoding declared in the <?xml encoding=...?>, NULL if none
508
private static String JavaDoc getXMLPrologEncoding(PushbackInputStream is,String JavaDoc guessedEnc) throws IOException {
509         String JavaDoc encoding = null;
510         if (guessedEnc!=null) {
511             byte[] bytes = new byte[PUSHBACK_MAX_SIZE];
512             int offset = 0;
513             int max = PUSHBACK_MAX_SIZE;
514             int c = is.read(bytes,offset,max);
515             while (c!=-1 && offset<PUSHBACK_MAX_SIZE) {
516                 offset += c;
517                 max -= c;
518                 c = is.read(bytes,offset,max);
519             }
520             int bytesRead = offset;
521             if (bytesRead>0) {
522                 is.unread(bytes,0,bytesRead);
523                 Reader reader = new InputStreamReader(new ByteArrayInputStream(bytes,0,bytesRead), guessedEnc);
524                 BufferedReader br = new BufferedReader(reader);
525                 String JavaDoc prolog = br.readLine();
526                 Matcher JavaDoc m = ENCODING_PATTERN.matcher(prolog);
527                 encoding = (m.find()) ? m.group(1).toUpperCase() : null;
528             }
529         }
530         return encoding;
531     }
532
533     // indicates if the MIME type belongs to the APPLICATION XML family
534
private static boolean isAppXml(String JavaDoc mime) {
535         return mime!=null &&
536                (mime.equals("application/xml") ||
537                 mime.equals("application/xml-dtd") ||
538                 mime.equals("application/xml-external-parsed-entity") ||
539                 (mime.startsWith("application/") && mime.endsWith("+xml")));
540     }
541
542     // indicates if the MIME type belongs to the TEXT XML family
543
private static boolean isTextXml(String JavaDoc mime) {
544         return mime!=null &&
545                (mime.equals("text/xml") ||
546                 mime.equals("text/xml-external-parsed-entity") ||
547                 (mime.startsWith("text/") && mime.endsWith("+xml")));
548     }
549
550     private static final MessageFormat JavaDoc RAW_EX_1 = new MessageFormat JavaDoc(
551             "Invalid encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] encoding mismatch");
552
553     private static final MessageFormat JavaDoc RAW_EX_2 = new MessageFormat JavaDoc(
554             "Invalid encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] unknown BOM");
555
556     private static final MessageFormat JavaDoc HTTP_EX_1 = new MessageFormat JavaDoc(
557             "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], BOM must be NULL");
558
559     private static final MessageFormat JavaDoc HTTP_EX_2 = new MessageFormat JavaDoc(
560             "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], encoding mismatch");
561
562     private static final MessageFormat JavaDoc HTTP_EX_3 = new MessageFormat JavaDoc(
563             "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], Invalid MIME");
564
565 }
566
Popular Tags