KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > apache > xerces > xinclude > XIncludeTextReader


/*
 * Copyright 2003-2005 The Apache Software Foundation.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

16 package org.apache.xerces.xinclude;
17
18 import java.io.BufferedInputStream JavaDoc;
19 import java.io.IOException JavaDoc;
20 import java.io.InputStream JavaDoc;
21 import java.io.InputStreamReader JavaDoc;
22 import java.io.Reader JavaDoc;
23 import java.net.HttpURLConnection JavaDoc;
24 import java.net.URL JavaDoc;
25 import java.net.URLConnection JavaDoc;
26 import java.util.Iterator JavaDoc;
27 import java.util.Locale JavaDoc;
28 import java.util.Map JavaDoc;
29
30 import org.apache.xerces.impl.XMLEntityManager;
31 import org.apache.xerces.impl.XMLErrorReporter;
32 import org.apache.xerces.impl.io.ASCIIReader;
33 import org.apache.xerces.impl.io.UTF8Reader;
34 import org.apache.xerces.impl.msg.XMLMessageFormatter;
35 import org.apache.xerces.util.EncodingMap;
36 import org.apache.xerces.util.HTTPInputSource;
37 import org.apache.xerces.util.MessageFormatter;
38 import org.apache.xerces.util.XMLChar;
39 import org.apache.xerces.xni.XMLString;
40 import org.apache.xerces.xni.parser.XMLInputSource;
41
42 /**
43  * This class is used for reading resources requested in <include> elements,
44  * when the parse attribute of the <include> element is "text". Using this
45  * class will open the location, detect the encoding, and discard the byte order
46  * mark, if applicable.
47  *
48  * REVISIT:
49  * Much of the code in this class is taken from XMLEntityManager. It would be nice
50  * if this code could be shared in some way. However, since XMLEntityManager is used
51  * for reading files as XML, and this needs to read files as text, there would need
52  * to be some refactoring done.
53  *
54  * @author Michael Glavassevich, IBM
55  * @author Peter McCracken, IBM
56  * @author Ankit Pasricha, IBM
57  * @author Arun Yadav, Sun Microsystems Inc.
58  *
59  * @version $Id: XIncludeTextReader.java,v 1.15 2005/05/08 18:21:08 mrglavas Exp $
60  *
61  * @see XIncludeHandler
62  */

63 public class XIncludeTextReader {
64
65     private Reader JavaDoc fReader;
66     private XIncludeHandler fHandler;
67     private XMLInputSource fSource;
68     private XMLErrorReporter fErrorReporter;
69     private XMLString fTempString = new XMLString();
70  
71     /**
72      * Construct the XIncludeReader using the XMLInputSource and XIncludeHandler.
73      *
74      * @param source The XMLInputSource to use.
75      * @param handler The XIncludeHandler to use.
76      * @param bufferSize The size of this text reader's buffer.
77      */

78     public XIncludeTextReader(XMLInputSource source, XIncludeHandler handler, int bufferSize)
79         throws IOException JavaDoc {
80         fHandler = handler;
81         fSource = source;
82         fTempString = new XMLString(new char[bufferSize + 1], 0, 0);
83     }
84     
85     /**
86      * Sets the XMLErrorReporter used for reporting errors while
87      * reading the text include.
88      *
89      * @param errorReporter the XMLErrorReporter to be used for
90      * reporting errors.
91      */

92     public void setErrorReporter(XMLErrorReporter errorReporter) {
93         fErrorReporter = errorReporter;
94     }
95
96     /**
97      * Return the Reader for given XMLInputSource.
98      *
99      * @param source The XMLInputSource to use.
100      */

101     protected Reader JavaDoc getReader(XMLInputSource source) throws IOException JavaDoc {
102         if (source.getCharacterStream() != null) {
103             return source.getCharacterStream();
104         }
105         else {
106             InputStream JavaDoc stream = null;
107
108             String JavaDoc encoding = source.getEncoding();
109             if (encoding == null) {
110                 encoding = "UTF-8";
111             }
112             if (source.getByteStream() != null) {
113                 stream = source.getByteStream();
114                 // Wrap the InputStream so that it is possible to rewind it.
115
if (!(stream instanceof BufferedInputStream JavaDoc)) {
116                     stream = new BufferedInputStream JavaDoc(stream, fTempString.ch.length);
117                 }
118             }
119             else {
120                 String JavaDoc expandedSystemId = XMLEntityManager.expandSystemId(source.getSystemId(), source.getBaseSystemId(), false);
121
122                 URL JavaDoc url = new URL JavaDoc(expandedSystemId);
123                 URLConnection JavaDoc urlCon = url.openConnection();
124                 
125                 // If this is an HTTP connection attach any request properties to the request.
126
if (urlCon instanceof HttpURLConnection JavaDoc && source instanceof HTTPInputSource) {
127                     final HttpURLConnection JavaDoc urlConnection = (HttpURLConnection JavaDoc) urlCon;
128                     final HTTPInputSource httpInputSource = (HTTPInputSource) source;
129                     
130                     // set request properties
131
Iterator JavaDoc propIter = httpInputSource.getHTTPRequestProperties();
132                     while (propIter.hasNext()) {
133                         Map.Entry JavaDoc entry = (Map.Entry JavaDoc) propIter.next();
134                         urlConnection.setRequestProperty((String JavaDoc) entry.getKey(), (String JavaDoc) entry.getValue());
135                     }
136                     
137                     // set preference for redirection
138
boolean followRedirects = httpInputSource.getFollowHTTPRedirects();
139                     if (!followRedirects) {
140                         XMLEntityManager.setInstanceFollowRedirects(urlConnection, followRedirects);
141                     }
142                 }
143                 
144                 // Wrap the InputStream so that it is possible to rewind it.
145
stream = new BufferedInputStream JavaDoc(urlCon.getInputStream());
146                 
147                 // content type will be string like "text/xml; charset=UTF-8" or "text/xml"
148
String JavaDoc rawContentType = urlCon.getContentType();
149                 
150                 // text/xml and application/xml offer only one optional parameter
151
int index = (rawContentType != null) ? rawContentType.indexOf(';') : -1;
152
153                 String JavaDoc contentType = null;
154                 String JavaDoc charset = null;
155                 if (index != -1) {
156                     // this should be something like "text/xml"
157
contentType = rawContentType.substring(0, index).trim();
158
159                     // this should be something like "charset=UTF-8", but we want to
160
// strip it down to just "UTF-8"
161
charset = rawContentType.substring(index + 1).trim();
162                     if (charset.startsWith("charset=")) {
163                         // 8 is the length of "charset="
164
charset = charset.substring(8).trim();
165                         // strip quotes, if present
166
if ((charset.charAt(0) == '"'
167                             && charset.charAt(charset.length() - 1) == '"')
168                             || (charset.charAt(0) == '\''
169                                 && charset.charAt(charset.length() - 1)
170                                     == '\'')) {
171                             charset =
172                                 charset.substring(1, charset.length() - 1);
173                         }
174                     }
175                     else {
176                         charset = null;
177                     }
178                 }
179                 else {
180                     contentType = rawContentType.trim();
181                 }
182
183                 String JavaDoc detectedEncoding = null;
184                 /** The encoding of such a resource is determined by:
185                     1 external encoding information, if available, otherwise
186                          -- the most common type of external information is the "charset" parameter of a MIME package
187                     2 if the media type of the resource is text/xml, application/xml, or matches the conventions text/*+xml or application/*+xml as described in XML Media Types [IETF RFC 3023], the encoding is recognized as specified in XML 1.0, otherwise
188                     3 the value of the encoding attribute if one exists, otherwise
189                     4 UTF-8.
190                  **/

191                 if (contentType.equals("text/xml")) {
192                     if (charset != null) {
193                         detectedEncoding = charset;
194                     }
195                     else {
196                         // see RFC2376 or 3023, section 3.1
197
detectedEncoding = "US-ASCII";
198                     }
199                 }
200                 else if (contentType.equals("application/xml")) {
201                     if (charset != null) {
202                         detectedEncoding = charset;
203                     }
204                     else {
205                         // see RFC2376 or 3023, section 3.2
206
detectedEncoding = getEncodingName(stream);
207                     }
208                 }
209                 else if (contentType.endsWith("+xml")) {
210                     detectedEncoding = getEncodingName(stream);
211                 }
212
213                 if (detectedEncoding != null) {
214                     encoding = detectedEncoding;
215                 }
216                 // else 3 or 4.
217
}
218             
219             encoding = encoding.toUpperCase(Locale.ENGLISH);
220             
221             // eat the Byte Order Mark
222
encoding = consumeBOM(stream, encoding);
223             
224             // If the document is UTF-8 or US-ASCII use
225
// the Xerces readers for these encodings. For
226
// US-ASCII consult the encoding map since
227
// this encoding has many aliases.
228
if (encoding.equals("UTF-8")) {
229                 return new UTF8Reader(stream,
230                     fTempString.ch.length,
231                     fErrorReporter.getMessageFormatter(XMLMessageFormatter.XML_DOMAIN),
232                     fErrorReporter.getLocale() );
233             }
234             
235             // Try to use a Java reader.
236
String JavaDoc javaEncoding = EncodingMap.getIANA2JavaMapping(encoding);
237             
238             // If the specified encoding wasn't a recognized IANA encoding throw an IOException.
239
// The XIncludeHandler will report this as a ResourceError and then will
240
// attempt to include a fallback if there is one.
241
if (javaEncoding == null) {
242                 MessageFormatter aFormatter =
243                     fErrorReporter.getMessageFormatter(XMLMessageFormatter.XML_DOMAIN);
244                 Locale JavaDoc aLocale = fErrorReporter.getLocale();
245                 throw new IOException JavaDoc( aFormatter.formatMessage( aLocale,
246                     "EncodingDeclInvalid",
247                     new Object JavaDoc[] {encoding} ) );
248             }
249             else if (javaEncoding.equals("ASCII")) {
250                 return new ASCIIReader(stream,
251                     fTempString.ch.length,
252                     fErrorReporter.getMessageFormatter(XMLMessageFormatter.XML_DOMAIN),
253                     fErrorReporter.getLocale() );
254             }
255             
256             return new InputStreamReader JavaDoc(stream, javaEncoding);
257         }
258     }
259
260     /**
261      * XMLEntityManager cares about endian-ness, since it creates its own optimized
262      * readers. Since we're just using generic Java readers for now, we're not caring
263      * about endian-ness. If this changes, even more code needs to be copied from
264      * XMLEntity manager. -- PJM
265      */

266     protected String JavaDoc getEncodingName(InputStream JavaDoc stream) throws IOException JavaDoc {
267         final byte[] b4 = new byte[4];
268         String JavaDoc encoding = null;
269
270         // this has the potential to throw an exception
271
// it will be fixed when we ensure the stream is rewindable (see note above)
272
stream.mark(4);
273         int count = stream.read(b4, 0, 4);
274         stream.reset();
275         if (count == 4) {
276             encoding = getEncodingName(b4);
277         }
278
279         return encoding;
280     }
281
282     /**
283      * Removes the byte order mark from the stream, if
284      * it exists and returns the encoding name.
285      *
286      * @param stream
287      * @param encoding
288      * @throws IOException
289      */

290     protected String JavaDoc consumeBOM(InputStream JavaDoc stream, String JavaDoc encoding)
291         throws IOException JavaDoc {
292
293         byte[] b = new byte[3];
294         int count = 0;
295         stream.mark(3);
296         if (encoding.equals("UTF-8")) {
297             count = stream.read(b, 0, 3);
298             if (count == 3) {
299                 final int b0 = b[0] & 0xFF;
300                 final int b1 = b[1] & 0xFF;
301                 final int b2 = b[2] & 0xFF;
302                 if (b0 != 0xEF || b1 != 0xBB || b2 != 0xBF) {
303                     // First three bytes are not BOM, so reset.
304
stream.reset();
305                 }
306             }
307             else {
308                 stream.reset();
309             }
310         }
311         else if (encoding.startsWith("UTF-16")) {
312             count = stream.read(b, 0, 2);
313             if (count == 2) {
314                 final int b0 = b[0] & 0xFF;
315                 final int b1 = b[1] & 0xFF;
316                 if (b0 == 0xFE && b1 == 0xFF) {
317                     return "UTF-16BE";
318                 }
319                 else if (b0 == 0xFF && b1 == 0xFE) {
320                     return "UTF-16LE";
321                 }
322             }
323             // First two bytes are not BOM, so reset.
324
stream.reset();
325         }
326         // We could do UTF-32, but since the getEncodingName() doesn't support that
327
// we won't support it here.
328
// To implement UTF-32, look for: 00 00 FE FF for big-endian
329
// or FF FE 00 00 for little-endian
330
return encoding;
331     }
332
333     /**
334      * REVISIT: This code is taken from org.apache.xerces.impl.XMLEntityManager.
335      * Is there any way we can share the code, without having it implemented twice?
336      * I think we should make it public and static in XMLEntityManager. --PJM
337      *
338      * Returns the IANA encoding name that is auto-detected from
339      * the bytes specified, with the endian-ness of that encoding where appropriate.
340      *
341      * @param b4 The first four bytes of the input.
342      * @return the encoding name, or null if no encoding could be detected
343      */

344     protected String JavaDoc getEncodingName(byte[] b4) {
345
346         // UTF-16, with BOM
347
int b0 = b4[0] & 0xFF;
348         int b1 = b4[1] & 0xFF;
349         if (b0 == 0xFE && b1 == 0xFF) {
350             // UTF-16, big-endian
351
return "UTF-16BE";
352         }
353         if (b0 == 0xFF && b1 == 0xFE) {
354             // UTF-16, little-endian
355
return "UTF-16LE";
356         }
357
358         // UTF-8 with a BOM
359
int b2 = b4[2] & 0xFF;
360         if (b0 == 0xEF && b1 == 0xBB && b2 == 0xBF) {
361             return "UTF-8";
362         }
363
364         // other encodings
365
int b3 = b4[3] & 0xFF;
366         if (b0 == 0x00 && b1 == 0x00 && b2 == 0x00 && b3 == 0x3C) {
367             // UCS-4, big endian (1234)
368
return "ISO-10646-UCS-4";
369         }
370         if (b0 == 0x3C && b1 == 0x00 && b2 == 0x00 && b3 == 0x00) {
371             // UCS-4, little endian (4321)
372
return "ISO-10646-UCS-4";
373         }
374         if (b0 == 0x00 && b1 == 0x00 && b2 == 0x3C && b3 == 0x00) {
375             // UCS-4, unusual octet order (2143)
376
return "ISO-10646-UCS-4";
377         }
378         if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x00) {
379             // UCS-4, unusual octect order (3412)
380
return "ISO-10646-UCS-4";
381         }
382         if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x3F) {
383             // UTF-16, big-endian, no BOM
384
// (or could turn out to be UCS-2...
385
return "UTF-16BE";
386         }
387         if (b0 == 0x3C && b1 == 0x00 && b2 == 0x3F && b3 == 0x00) {
388             // UTF-16, little-endian, no BOM
389
// (or could turn out to be UCS-2...
390
return "UTF-16LE";
391         }
392         if (b0 == 0x4C && b1 == 0x6F && b2 == 0xA7 && b3 == 0x94) {
393             // EBCDIC
394
// a la xerces1, return CP037 instead of EBCDIC here
395
return "CP037";
396         }
397
398         // this signals us to use the value from the encoding attribute
399
return null;
400
401     } // getEncodingName(byte[]):Object[]
402

403     /**
404      * Read the input stream as text, and pass the text on to the XIncludeHandler
405      * using calls to characters(). This will read all of the text it can from the
406      * resource.
407      *
408      * @throws IOException
409      */

410     public void parse() throws IOException JavaDoc {
411         
412         fReader = getReader(fSource);
413         fSource = null;
414         int readSize = fReader.read(fTempString.ch, 0, fTempString.ch.length - 1);
415         while (readSize != -1) {
416             for (int i = 0; i < readSize; ++i) {
417                 char ch = fTempString.ch[i];
418                 if (!isValid(ch)) {
419                     if (XMLChar.isHighSurrogate(ch)) {
420                         int ch2;
421                         // retrieve next character
422
if (++i < readSize) {
423                             ch2 = fTempString.ch[i];
424                         }
425                         // handle rare boundary case
426
else {
427                             ch2 = fReader.read();
428                             if (ch2 != -1) {
429                                 fTempString.ch[readSize++] = (char) ch2;
430                             }
431                         }
432                         if (XMLChar.isLowSurrogate(ch2)) {
433                             // convert surrogates to a supplemental character
434
int sup = XMLChar.supplemental(ch, (char)ch2);
435                             if (!isValid(sup)) {
436                                 fErrorReporter.reportError(XMLMessageFormatter.XML_DOMAIN,
437                                                            "InvalidCharInContent",
438                                                            new Object JavaDoc[] { Integer.toString(sup, 16) },
439                                                            XMLErrorReporter.SEVERITY_FATAL_ERROR);
440                             }
441                         }
442                         else {
443                             fErrorReporter.reportError(XMLMessageFormatter.XML_DOMAIN,
444                                                        "InvalidCharInContent",
445                                                        new Object JavaDoc[] { Integer.toString(ch2, 16) },
446                                                        XMLErrorReporter.SEVERITY_FATAL_ERROR);
447                         }
448                     }
449                     else {
450                         fErrorReporter.reportError(XMLMessageFormatter.XML_DOMAIN,
451                                                    "InvalidCharInContent",
452                                                    new Object JavaDoc[] { Integer.toString(ch, 16) },
453                                                    XMLErrorReporter.SEVERITY_FATAL_ERROR);
454                     }
455                 }
456             }
457             if (fHandler != null && readSize > 0) {
458                 fTempString.offset = 0;
459                 fTempString.length = readSize;
460                 fHandler.characters(
461                     fTempString,
462                     fHandler.modifyAugmentations(null, true));
463             }
464             readSize = fReader.read(fTempString.ch, 0, fTempString.ch.length - 1);
465         }
466         
467     }
468     
469     /**
470      * Sets the input source on this text reader.
471      *
472      * @param source The XMLInputSource to use.
473      */

474     public void setInputSource(XMLInputSource source) {
475         fSource = source;
476     }
477     
478     /**
479      * Closes the stream. Call this after parse(), or when there is no longer any need
480      * for this object.
481      *
482      * @throws IOException
483      */

484     public void close() throws IOException JavaDoc {
485         if (fReader != null) {
486             fReader.close();
487             fReader = null;
488         }
489     }
490     
491     /**
492      * Returns true if the specified character is a valid XML character
493      * as per the rules of XML 1.0.
494      *
495      * @param ch The character to check.
496      */

497     protected boolean isValid(int ch) {
498         return XMLChar.isValid(ch);
499     }
500     
501     /**
502      * Sets the buffer size property for the reader which decides the chunk sizes that are parsed
503      * by the reader at a time and passed to the handler
504      *
505      * @param bufferSize The size of the buffer desired
506      */

507     protected void setBufferSize(int bufferSize) {
508         if (fTempString.ch.length != ++bufferSize) {
509             fTempString.ch = new char[bufferSize];
510         }
511     }
512  
513 }
Popular Tags