KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > com > sun > syndication > io > XmlReader


1 /*
2  * Copyright 2004 Sun Microsystems, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  *
16  */

17 package com.sun.syndication.io;
18
import java.io.*;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;
import java.text.MessageFormat;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
26
27 /**
28  * Character stream that handles (or at least attemtps to) all the necessary Voodo to figure out
29  * the charset encoding of the XML document within the stream.
30  * <p>
31  * IMPORTANT: This class is not related in any way to the org.xml.sax.XMLReader. This one IS a
32  * character stream.
33  * <p>
34  * All this has to be done without consuming characters from the stream, if not the XML parser
35  * will not recognized the document as a valid XML. This is not 100% true, but it's close enough
36  * (UTF-8 BOM is not handled by all parsers right now, XmlReader handles it and things work in all
37  * parsers).
38  * <p>
39  * The XmlReader class handles the charset encoding of XML documents in Files, raw streams and
40  * HTTP streams by offering a wide set of constructors.
41  * <P>
42  * By default the charset encoding detection is lenient, the constructor with the lenient flag
43  * can be used for an script (following HTTP MIME and XML specifications).
44  * All this is nicely explained by Mark Pilgrim in his blog,
45  * <a HREF="http://diveintomark.org/archives/2004/02/13/xml-media-types">
46  * Determining the character encoding of a feed</a>.
47  * <p>
48  * @author Alejandro Abdelnur
49  *
50  */

51 public class XmlReader extends Reader {
52     private static final int PUSHBACK_MAX_SIZE = 4096;
53
54     private static final String JavaDoc UTF_8 = "UTF-8";
55     private static final String JavaDoc US_ASCII = "US-ASCII";
56     private static final String JavaDoc UTF_16BE = "UTF-16BE";
57     private static final String JavaDoc UTF_16LE = "UTF-16LE";
58     private static final String JavaDoc UTF_16 = "UTF-16";
59
60     private Reader _reader;
61     private String JavaDoc _encoding;
62
63     /**
64      * Creates a Reader for a File.
65      * <p>
66      * It looks for the UTF-8 BOM first, if none sniffs the XML prolog charset, if this is also
67      * missing defaults to UTF-8.
68      * <p>
69      * It does a lenient charset encoding detection, check the constructor with the lenient parameter
70      * for details.
71      * <p>
72      * @param file File to create a Reader from.
73      * @throws IOException thrown if there is a problem reading the file.
74      *
75      */

76     public XmlReader(File file) throws IOException {
77         this(new FileInputStream(file));
78     }
79
80     /**
81      * Creates a Reader for a raw InputStream.
82      * <p>
83      * It follows the same logic used for files.
84      * <p>
85      * It does a lenient charset encoding detection, check the constructor with the lenient parameter
86      * for details.
87      * <p>
88      * @param is InputStream to create a Reader from.
89      * @throws IOException thrown if there is a problem reading the stream.
90      *
91      */

92     public XmlReader(InputStream is) throws IOException {
93         this(is,true);
94     }
95
96     /**
97      * Creates a Reader for a raw InputStream.
98      * <p>
99      * It follows the same logic used for files.
100      * <p>
101      * If lenient detection is indicated and the detection above fails as per specifications it then attempts
102      * the following:
103      * <p>
104      * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again.
105      * <p>
106      * Else if the XML prolog had a charset encoding that encoding is used.
107      * <p>
108      * Else if the content type had a charset encoding that encoding is used.
109      * <p>
110      * Else 'UTF-8' is used.
111      * <p>
112      * If lenient detection is indicated an XmlReaderException is never thrown.
113      * <p>
114      * @param is InputStream to create a Reader from.
115      * @param lenient indicates if the charset encoding detection should be relaxed.
116      * @throws IOException thrown if there is a problem reading the stream.
117      * @throws XmlReaderException thrown if the charset encoding could not be determined according to the specs.
118      *
119      */

120     public XmlReader(InputStream is,boolean lenient) throws IOException, XmlReaderException {
121         try {
122             doRawStream(is,lenient);
123         }
124         catch (XmlReaderException ex) {
125             if (!lenient) {
126                 throw ex;
127             }
128             else {
129                 doLenientDetection(null,ex);
130             }
131         }
132     }
133
134     /**
135      * Creates a Reader using the InputStream of a URL.
136      * <p>
137      * If the URL is not of type HTTP and there is not 'content-type' header in the fetched
138      * data it uses the same logic used for Files.
139      * <p>
140      * If the URL is a HTTP Url or there is a 'content-type' header in the fetched
141      * data it uses the same logic used for an InputStream with content-type.
142      * <p>
143      * It does a lenient charset encoding detection, check the constructor with the lenient parameter
144      * for details.
145      * <p>
146      * @param url URL to create a Reader from.
147      * @throws IOException thrown if there is a problem reading the stream of the URL.
148      *
149      */

150     public XmlReader(URL JavaDoc url) throws IOException {
151         this(url.openConnection());
152     }
153
154     /**
155      * Creates a Reader using the InputStream of a URLConnection.
156      * <p>
157      * If the URLConnection is not of type HttpURLConnection and there is not
158      * 'content-type' header in the fetched data it uses the same logic used for files.
159      * <p>
160      * If the URLConnection is a HTTP Url or there is a 'content-type' header in the fetched
161      * data it uses the same logic used for an InputStream with content-type.
162      * <p>
163      * It does a lenient charset encoding detection, check the constructor with the lenient parameter
164      * for details.
165      * <p>
166      * @param conn URLConnection to create a Reader from.
167      * @throws IOException thrown if there is a problem reading the stream of the URLConnection.
168      *
169      */

170     public XmlReader(URLConnection JavaDoc conn) throws IOException {
171         boolean lenient = true;
172         if (conn instanceof HttpURLConnection JavaDoc) {
173             try {
174                 doHttpStream(conn.getInputStream(),conn.getContentType(),lenient);
175             }
176             catch (XmlReaderException ex) {
177                 doLenientDetection(conn.getContentType(),ex);
178             }
179         }
180         else
181         if (conn.getContentType()!=null) {
182             try {
183                 doHttpStream(conn.getInputStream(),conn.getContentType(),lenient);
184             }
185             catch (XmlReaderException ex) {
186                 doLenientDetection(conn.getContentType(),ex);
187             }
188         }
189         else {
190             try {
191                 doRawStream(conn.getInputStream(),lenient);
192             }
193             catch (XmlReaderException ex) {
194                 doLenientDetection(null,ex);
195             }
196         }
197     }
198
199     /**
200      * Creates a Reader using an InputStream an the associated content-type header.
201      * <p>
202      * First it checks if the stream has BOM. If there is not BOM checks the content-type encoding.
203      * If there is not content-type encoding checks the XML prolog encoding. If there is not XML
204      * prolog encoding uses the default encoding mandated by the content-type MIME type.
205      * <p>
206      * It does a lenient charset encoding detection, check the constructor with the lenient parameter
207      * for details.
208      * <p>
209      * @param is InputStream to create the reader from.
210      * @param httpContentType content-type header to use for the resolution of the charset encoding.
211      * @throws IOException thrown if there is a problem reading the file.
212      *
213      */

214     public XmlReader(InputStream is,String JavaDoc httpContentType) throws IOException {
215         this(is,httpContentType,true);
216     }
217
218     /**
219      * Creates a Reader using an InputStream an the associated content-type header. This constructor is
220      * lenient regarding the encoding detection.
221      * <p>
222      * First it checks if the stream has BOM. If there is not BOM checks the content-type encoding.
223      * If there is not content-type encoding checks the XML prolog encoding. If there is not XML
224      * prolog encoding uses the default encoding mandated by the content-type MIME type.
225      * <p>
226      * If lenient detection is indicated and the detection above fails as per specifications it then attempts
227      * the following:
228      * <p>
229      * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again.
230      * <p>
231      * Else if the XML prolog had a charset encoding that encoding is used.
232      * <p>
233      * Else if the content type had a charset encoding that encoding is used.
234      * <p>
235      * Else 'UTF-8' is used.
236      * <p>
237      * If lenient detection is indicated an XmlReaderException is never thrown.
238      * <p>
239      * @param is InputStream to create the reader from.
240      * @param httpContentType content-type header to use for the resolution of the charset encoding.
241      * @param lenient indicates if the charset encoding detection should be relaxed.
242      * @throws IOException thrown if there is a problem reading the file.
243      * @throws XmlReaderException thrown if the charset encoding could not be determined according to the specs.
244      *
245      */

246     public XmlReader(InputStream is,String JavaDoc httpContentType,boolean lenient) throws IOException, XmlReaderException {
247         try {
248             doHttpStream(is,httpContentType,lenient);
249         }
250         catch (XmlReaderException ex) {
251             if (!lenient) {
252                 throw ex;
253             }
254             else {
255                 doLenientDetection(httpContentType,ex);
256             }
257         }
258     }
259
260     private void doLenientDetection(String JavaDoc httpContentType,XmlReaderException ex) throws IOException {
261         if (httpContentType!=null) {
262             if (httpContentType.startsWith("text/html")) {
263                 httpContentType = httpContentType.substring("text/html".length());
264                 httpContentType = "text/xml" + httpContentType;
265                 try {
266                     doHttpStream(ex.getInputStream(),httpContentType,true);
267                     ex = null;
268                 }
269                 catch (XmlReaderException ex2) {
270                     ex = ex2;
271                 }
272             }
273         }
274         if (ex!=null) {
275             String JavaDoc encoding = ex.getXmlEncoding();
276             if (encoding==null) {
277                 encoding = ex.getContentTypeEncoding();
278             }
279             if (encoding==null) {
280                 encoding = UTF_8;
281             }
282             prepareReader(ex.getInputStream(),encoding);
283         }
284     }
285
286     /**
287      * Returns the charset encoding of the XmlReader.
288      * <p>
289      * @return charset encoding.
290      *
291      */

292     public String JavaDoc getEncoding() {
293         return _encoding;
294     }
295
296     public int read(char[] buf,int offset,int len) throws IOException {
297         return _reader.read(buf,offset,len);
298     }
299
300     /**
301      * Closes the XmlReader stream.
302      * <p>
303      * @throws IOException thrown if there was a problem closing the stream.
304      *
305      */

306     public void close() throws IOException {
307         _reader.close();
308     }
309
310     private void doRawStream(InputStream is,boolean lenient) throws IOException {
311         PushbackInputStream pis = new PushbackInputStream(is,PUSHBACK_MAX_SIZE);
312         String JavaDoc bomEnc = getBOMEncoding(pis);
313         String JavaDoc xmlGuessEnc = getXMLGuessEncoding(pis);
314         String JavaDoc xmlEnc = getXmlProlog(pis,xmlGuessEnc);
315         String JavaDoc encoding = calculateRawEncoding(bomEnc, xmlGuessEnc, xmlEnc, pis);
316         prepareReader(pis,encoding);
317     }
318
319     private void doHttpStream(InputStream is,String JavaDoc httpContentType,boolean lenient) throws IOException {
320         PushbackInputStream pis = new PushbackInputStream(is,PUSHBACK_MAX_SIZE);
321         String JavaDoc cTMime = getContentTypeMime(httpContentType);
322         String JavaDoc cTEnc = getContentTypeEncoding(httpContentType);
323         String JavaDoc bomEnc = getBOMEncoding(pis);
324         String JavaDoc xmlGuessEnc = getXMLGuessEncoding(pis);
325         String JavaDoc xmlEnc = getXmlProlog(pis,xmlGuessEnc);
326         String JavaDoc encoding = calculateHttpEncoding(cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc, pis,lenient);
327         prepareReader(pis,encoding);
328     }
329
330     private void prepareReader(InputStream is,String JavaDoc encoding) throws IOException {
331         _reader = new InputStreamReader(is,encoding);
332         _encoding = encoding;
333     }
334
335     // InputStream is passed for XmlReaderException creation only
336
private static String JavaDoc calculateRawEncoding(String JavaDoc bomEnc, String JavaDoc xmlGuessEnc, String JavaDoc xmlEnc, InputStream is) throws IOException {
337         String JavaDoc encoding;
338         if (bomEnc==null) {
339             if (xmlGuessEnc==null || xmlEnc==null) {
340                 encoding = UTF_8;
341             }
342             else
343             if (xmlEnc.equals(UTF_16) && (xmlGuessEnc.equals(UTF_16BE) || xmlGuessEnc.equals(UTF_16LE))) {
344                 encoding = xmlGuessEnc;
345             }
346             else {
347                 encoding = xmlEnc;
348             }
349         }
350         else
351         if (bomEnc.equals(UTF_8)) {
352             if (xmlGuessEnc!=null && !xmlGuessEnc.equals(UTF_8)) {
353                 throw new XmlReaderException(RAW_EX_1.format(new Object JavaDoc[]{bomEnc,xmlGuessEnc,xmlEnc}),
354                                              bomEnc,xmlGuessEnc,xmlEnc,is);
355             }
356             if (xmlEnc!=null && !xmlEnc.equals(UTF_8)) {
357                 throw new XmlReaderException(RAW_EX_1.format(new Object JavaDoc[]{bomEnc,xmlGuessEnc,xmlEnc}),
358                                              bomEnc,xmlGuessEnc,xmlEnc,is);
359             }
360             encoding = UTF_8;
361         }
362         else
363         if (bomEnc.equals(UTF_16BE) || bomEnc.equals(UTF_16LE)) {
364             if (xmlGuessEnc!=null && !xmlGuessEnc.equals(bomEnc)) {
365                 throw new IOException(RAW_EX_1.format(new Object JavaDoc[]{bomEnc,xmlGuessEnc,xmlEnc}));
366             }
367             if (xmlEnc!=null && !xmlEnc.equals(UTF_16) && !xmlEnc.equals(bomEnc)) {
368                 throw new XmlReaderException(RAW_EX_1.format(new Object JavaDoc[]{bomEnc,xmlGuessEnc,xmlEnc}),
369                                              bomEnc,xmlGuessEnc,xmlEnc,is);
370             }
371             encoding =bomEnc;
372         }
373         else {
374             throw new XmlReaderException(RAW_EX_2.format(new Object JavaDoc[]{bomEnc,xmlGuessEnc,xmlEnc}),
375                                          bomEnc,xmlGuessEnc,xmlEnc,is);
376         }
377         return encoding;
378     }
379
380     // InputStream is passed for XmlReaderException creation only
381
private static String JavaDoc calculateHttpEncoding(String JavaDoc cTMime, String JavaDoc cTEnc, String JavaDoc bomEnc, String JavaDoc xmlGuessEnc, String JavaDoc xmlEnc, InputStream is,boolean lenient) throws IOException {
382         String JavaDoc encoding;
383         if (lenient & xmlEnc!=null) {
384             encoding = xmlEnc;
385         }
386         else {
387             boolean appXml = isAppXml(cTMime);
388             boolean textXml = isTextXml(cTMime);
389             if (appXml || textXml) {
390                 if (cTEnc==null) {
391                     if (appXml) {
392                         encoding = calculateRawEncoding(bomEnc, xmlGuessEnc, xmlEnc, is);
393                     }
394                     else {
395                         encoding = US_ASCII;
396                     }
397                 }
398                 else
399                 if (bomEnc!=null && (cTEnc.equals(UTF_16BE) || cTEnc.equals(UTF_16LE))) {
400                     throw new XmlReaderException(HTTP_EX_1.format(new Object JavaDoc[]{cTMime,cTEnc,bomEnc,xmlGuessEnc,xmlEnc}),
401                                                  cTMime,cTEnc,bomEnc,xmlGuessEnc,xmlEnc,is);
402                 }
403                 else
404                 if (cTEnc.equals(UTF_16)) {
405                     if (bomEnc!=null && bomEnc.startsWith(UTF_16)) {
406                         encoding = bomEnc;
407                     }
408                     else {
409                         throw new XmlReaderException(HTTP_EX_2.format(new Object JavaDoc[]{cTMime,cTEnc,bomEnc,xmlGuessEnc,xmlEnc}),
410                                                      cTMime,cTEnc,bomEnc,xmlGuessEnc,xmlEnc,is);
411                     }
412                 }
413                 else {
414                     encoding = cTEnc;
415                 }
416             }
417             else {
418                 throw new XmlReaderException(HTTP_EX_3.format(new Object JavaDoc[]{cTMime,cTEnc,bomEnc,xmlGuessEnc,xmlEnc}),
419                                              cTMime,cTEnc,bomEnc,xmlGuessEnc,xmlEnc,is);
420             }
421         }
422         return encoding;
423     }
424
425     // returns MIME type or NULL if httpContentType is NULL
426
private static String JavaDoc getContentTypeMime(String JavaDoc httpContentType) {
427         String JavaDoc mime = null;
428         if (httpContentType!=null) {
429             int i = httpContentType.indexOf(";");
430             mime = ((i==-1) ? httpContentType : httpContentType.substring(0,i)).trim();
431         }
432         return mime;
433     }
434
435     private static final Pattern JavaDoc CHARSET_PATTERN = Pattern.compile("charset=([.[^; ]]*)");
436
437     // returns charset parameter value, NULL if not present, NULL if httpContentType is NULL
438
private static String JavaDoc getContentTypeEncoding(String JavaDoc httpContentType) {
439         String JavaDoc encoding = null;
440         if (httpContentType!=null) {
441             int i = httpContentType.indexOf(";");
442             if (i>-1) {
443                 String JavaDoc postMime = httpContentType.substring(i+1);
444                 Matcher JavaDoc m = CHARSET_PATTERN.matcher(postMime);
445                 encoding = (m.find()) ? m.group(1) : null;
446                 encoding = (encoding!=null) ? encoding.toUpperCase() : null;
447             }
448         }
449         return encoding;
450     }
451
452     // returns the BOM in the stream, NULL if not present,
453
// if there was BOM the in the stream it is consumed
454
private static String JavaDoc getBOMEncoding(PushbackInputStream is) throws IOException {
455         String JavaDoc encoding = null;
456         int[] bytes = new int[3];
457         bytes[0] = is.read();
458         bytes[1] = is.read();
459         bytes[2] = is.read();
460
461         if (bytes[0] == 0xFE && bytes[1] == 0xFF) {
462             encoding = UTF_16BE;
463             is.unread(bytes[2]);
464         }
465         else
466         if (bytes[0] == 0xFF && bytes[1] == 0xFE) {
467             encoding = UTF_16LE;
468             is.unread(bytes[2]);
469         }
470         else
471         if (bytes[0] == 0xEF && bytes[1] == 0xBB && bytes[2] == 0xBF) {
472             encoding = UTF_8;
473         }
474         else {
475             for (int i=bytes.length-1;i>=0;i--) {
476                 is.unread(bytes[i]);
477             }
478         }
479         return encoding;
480     }
481
482     // returns the best guess for the encoding by looking the first bytes of the stream, '<?'
483
private static String JavaDoc getXMLGuessEncoding(PushbackInputStream is) throws IOException {
484         String JavaDoc encoding = null;
485         int[] bytes = new int[4];
486         bytes[0] = is.read();
487         bytes[1] = is.read();
488         bytes[2] = is.read();
489         bytes[3] = is.read();
490         for (int i=bytes.length-1;i>=0;i--) {
491             is.unread(bytes[i]);
492         }
493
494         if (bytes[0] == 0x00 && bytes[1] == 0x3C && bytes[2] == 0x00 && bytes[3] == 0x3F) {
495                 encoding = UTF_16BE;
496         }
497         else
498         if (bytes[0] == 0x3C && bytes[1] == 0x00 && bytes[2] == 0x3F && bytes[3] == 0x00) {
499                 encoding = UTF_16LE;
500         }
501         else
502         if (bytes[0] == 0x3C && bytes[1] == 0x3F && bytes[2] == 0x78 && bytes[3] == 0x6D) {
503             encoding = UTF_8;
504         }
505         return encoding;
506     }
507
508     private static final Pattern JavaDoc ENCODING_PATTERN = Pattern.compile("<\\?xml.*encoding=\"(.[^\"]*)\".*\\?>");
509
510     // returns the encoding declared in the <?xml encoding=...?>, NULL if none
511
private static String JavaDoc getXmlProlog(PushbackInputStream is,String JavaDoc guessedEnc) throws IOException {
512         String JavaDoc encoding = null;
513         if (guessedEnc!=null) {
514             byte[] bytes = new byte[PUSHBACK_MAX_SIZE];
515             int offset = 0;
516             int max = PUSHBACK_MAX_SIZE;
517             int c = is.read(bytes,offset,max);
518             while (c!=-1 && offset<PUSHBACK_MAX_SIZE) {
519                 offset += c;
520                 max -= c;
521                 c = is.read(bytes,offset,max);
522             }
523             int bytesRead = offset;
524             if (bytesRead>0) {
525                 is.unread(bytes,0,bytesRead);
526                 Reader reader = new InputStreamReader(new ByteArrayInputStream(bytes,0,bytesRead), guessedEnc);
527                 BufferedReader br = new BufferedReader(reader);
528                 String JavaDoc prolog = br.readLine();
529                 Matcher JavaDoc m = ENCODING_PATTERN.matcher(prolog);
530                 encoding = (m.find()) ? m.group(1).toUpperCase() : null;
531             }
532         }
533         return encoding;
534     }
535
536     // indicates if the MIME type belongs to the APPLICATION XML family
537
private static boolean isAppXml(String JavaDoc mime) {
538         return mime!=null &&
539                (mime.equals("application/xml") ||
540                 mime.equals("application/xml-dtd") ||
541                 mime.equals("application/xml-external-parsed-entity") ||
542                 (mime.startsWith("application/") && mime.endsWith("+xml")));
543     }
544
545     // indicates if the MIME type belongs to the TEXT XML family
546
private static boolean isTextXml(String JavaDoc mime) {
547         return mime!=null &&
548                (mime.equals("text/xml") ||
549                 mime.equals("text/xml-external-parsed-entity") ||
550                 (mime.startsWith("text/") && mime.endsWith("+xml")));
551     }
552
553     private static final MessageFormat JavaDoc RAW_EX_1 = new MessageFormat JavaDoc(
554             "Invalid encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] encoding mismatch");
555
556     private static final MessageFormat JavaDoc RAW_EX_2 = new MessageFormat JavaDoc(
557             "Invalid encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] unknown BOM");
558
559     private static final MessageFormat JavaDoc HTTP_EX_1 = new MessageFormat JavaDoc(
560             "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], BOM must be NULL");
561
562     private static final MessageFormat JavaDoc HTTP_EX_2 = new MessageFormat JavaDoc(
563             "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], encoding mismatch");
564
565     private static final MessageFormat JavaDoc HTTP_EX_3 = new MessageFormat JavaDoc(
566             "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], Invalid MIME");
567
568 }
569
Popular Tags