XIncludeTextReader


1   /*
2    * Copyright 2003-2005 The Apache Software Foundation.
3    * 
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    * 
8    *      http://www.apache.org/licenses/LICENSE-2.0
9    * 
10   * Unless required by applicable law or agreed to in writing, software
11   * distributed under the License is distributed on an "AS IS" BASIS,
12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   * See the License for the specific language governing permissions and
14   * limitations under the License.
15   */
16  package org.apache.xerces.xinclude;
17  
18  import java.io.BufferedInputStream  ;
19  import java.io.IOException  ;
20  import java.io.InputStream  ;
21  import java.io.InputStreamReader  ;
22  import java.io.Reader  ;
23  import java.net.HttpURLConnection  ;
24  import java.net.URL  ;
25  import java.net.URLConnection  ;
26  import java.util.Iterator  ;
27  import java.util.Locale  ;
28  import java.util.Map  ;
29  
30  import org.apache.xerces.impl.XMLEntityManager;
31  import org.apache.xerces.impl.XMLErrorReporter;
32  import org.apache.xerces.impl.io.ASCIIReader;
33  import org.apache.xerces.impl.io.UTF8Reader;
34  import org.apache.xerces.impl.msg.XMLMessageFormatter;
35  import org.apache.xerces.util.EncodingMap;
36  import org.apache.xerces.util.HTTPInputSource;
37  import org.apache.xerces.util.MessageFormatter;
38  import org.apache.xerces.util.XMLChar;
39  import org.apache.xerces.xni.XMLString;
40  import org.apache.xerces.xni.parser.XMLInputSource;
41  
42  /**
43   * This class is used for reading resources requested in &lt;include&gt; elements,
44   * when the parse attribute of the &lt;include&gt; element is "text".  Using this
45   * class will open the location, detect the encoding, and discard the byte order
46   * mark, if applicable.
47   * 
48   * REVISIT:
49   * Much of the code in this class is taken from XMLEntityManager.  It would be nice
50   * if this code could be shared in some way.  However, since XMLEntityManager is used
51   * for reading files as XML, and this needs to read files as text, there would need
52   * to be some refactoring done.
53   * 
54   * @author Michael Glavassevich, IBM
55   * @author Peter McCracken, IBM
56   * @author Ankit Pasricha, IBM
57   * @author Arun Yadav, Sun Microsystems Inc.
58   *
59   * @version $Id: XIncludeTextReader.java,v 1.15 2005/05/08 18:21:08 mrglavas Exp $
60   *
61   * @see XIncludeHandler
62   */
63  public class XIncludeTextReader {
64  
65      private Reader   fReader;
66      private XIncludeHandler fHandler;
67      private XMLInputSource fSource;
68      private XMLErrorReporter fErrorReporter;
69      private XMLString fTempString = new XMLString();
70   
71      /**
72       * Construct the XIncludeReader using the XMLInputSource and XIncludeHandler.
73       *
74       * @param source The XMLInputSource to use.
75       * @param handler The XIncludeHandler to use.
76       * @param bufferSize The size of this text reader's buffer.
77       */
78      public XIncludeTextReader(XMLInputSource source, XIncludeHandler handler, int bufferSize)
79          throws IOException   {
80          fHandler = handler;
81          fSource = source;
82          fTempString = new XMLString(new char[bufferSize + 1], 0, 0);
83      }
84      
85      /**
86       * Sets the XMLErrorReporter used for reporting errors while
87       * reading the text include.
88       *
89       * @param errorReporter the XMLErrorReporter to be used for
90       * reporting errors.
91       */
92      public void setErrorReporter(XMLErrorReporter errorReporter) {
93          fErrorReporter = errorReporter;
94      }
95  
96      /**
97       * Return the Reader for given XMLInputSource.
98       *
99       * @param source The XMLInputSource to use.
100      */
101     protected Reader   getReader(XMLInputSource source) throws IOException   {
102         if (source.getCharacterStream() != null) {
103             return source.getCharacterStream();
104         }
105         else {
106             InputStream   stream = null;
107 
108             String   encoding = source.getEncoding();
109             if (encoding == null) {
110                 encoding = "UTF-8";
111             }
112             if (source.getByteStream() != null) {
113                 stream = source.getByteStream();
114                 // Wrap the InputStream so that it is possible to rewind it.
115                 if (!(stream instanceof BufferedInputStream  )) {
116                     stream = new BufferedInputStream  (stream, fTempString.ch.length);
117                 }
118             }
119             else {
120                 String   expandedSystemId = XMLEntityManager.expandSystemId(source.getSystemId(), source.getBaseSystemId(), false);
121 
122                 URL   url = new URL  (expandedSystemId);
123                 URLConnection   urlCon = url.openConnection();
124                 
125                 // If this is an HTTP connection attach any request properties to the request.
126                 if (urlCon instanceof HttpURLConnection   && source instanceof HTTPInputSource) {
127                     final HttpURLConnection   urlConnection = (HttpURLConnection  ) urlCon;
128                     final HTTPInputSource httpInputSource = (HTTPInputSource) source;
129                     
130                     // set request properties
131                     Iterator   propIter = httpInputSource.getHTTPRequestProperties();
132                     while (propIter.hasNext()) {
133                         Map.Entry   entry = (Map.Entry  ) propIter.next();
134                         urlConnection.setRequestProperty((String  ) entry.getKey(), (String  ) entry.getValue());
135                     }
136                     
137                     // set preference for redirection
138                     boolean followRedirects = httpInputSource.getFollowHTTPRedirects();
139                     if (!followRedirects) {
140                         XMLEntityManager.setInstanceFollowRedirects(urlConnection, followRedirects);
141                     }
142                 }
143                 
144                 // Wrap the InputStream so that it is possible to rewind it.
145                 stream = new BufferedInputStream  (urlCon.getInputStream());
146                 
147                 // content type will be string like "text/xml; charset=UTF-8" or "text/xml"
148                 String   rawContentType = urlCon.getContentType();
149                 
150                 // text/xml and application/xml offer only one optional parameter
151                 int index = (rawContentType != null) ? rawContentType.indexOf(';') : -1;
152 
153                 String   contentType = null;
154                 String   charset = null;
155                 if (index != -1) {
156                     // this should be something like "text/xml"
157                     contentType = rawContentType.substring(0, index).trim();
158 
159                     // this should be something like "charset=UTF-8", but we want to
160                     // strip it down to just "UTF-8"
161                     charset = rawContentType.substring(index + 1).trim();
162                     if (charset.startsWith("charset=")) {
163                         // 8 is the length of "charset="
164                         charset = charset.substring(8).trim();
165                         // strip quotes, if present
166                         if ((charset.charAt(0) == '"'
167                             && charset.charAt(charset.length() - 1) == '"')
168                             || (charset.charAt(0) == '\''
169                                 && charset.charAt(charset.length() - 1)
170                                     == '\'')) {
171                             charset =
172                                 charset.substring(1, charset.length() - 1);
173                         }
174                     }
175                     else {
176                         charset = null;
177                     }
178                 }
179                 else {
180                     contentType = rawContentType.trim();
181                 }
182 
183                 String   detectedEncoding = null;
184                 /**  The encoding of such a resource is determined by:
185                     1 external encoding information, if available, otherwise
186                          -- the most common type of external information is the "charset" parameter of a MIME package
187                     2 if the media type of the resource is text/xml, application/xml, or matches the conventions text/*+xml or application/*+xml as described in XML Media Types [IETF RFC 3023], the encoding is recognized as specified in XML 1.0, otherwise
188                     3 the value of the encoding attribute if one exists, otherwise
189                     4 UTF-8.
190                  **/
191                 if (contentType.equals("text/xml")) {
192                     if (charset != null) {
193                         detectedEncoding = charset;
194                     }
195                     else {
196                         // see RFC2376 or 3023, section 3.1
197                         detectedEncoding = "US-ASCII";
198                     }
199                 }
200                 else if (contentType.equals("application/xml")) {
201                     if (charset != null) {
202                         detectedEncoding = charset;
203                     }
204                     else {
205                         // see RFC2376 or 3023, section 3.2
206                         detectedEncoding = getEncodingName(stream);
207                     }
208                 }
209                 else if (contentType.endsWith("+xml")) {
210                     detectedEncoding = getEncodingName(stream);
211                 }
212 
213                 if (detectedEncoding != null) {
214                     encoding = detectedEncoding;
215                 }
216                 // else 3 or 4.
217             }
218             
219             encoding = encoding.toUpperCase(Locale.ENGLISH);
220             
221             // eat the Byte Order Mark
222             encoding = consumeBOM(stream, encoding);
223             
224             // If the document is UTF-8 or US-ASCII use 
225             // the Xerces readers for these encodings. For
226             // US-ASCII consult the encoding map since
227             // this encoding has many aliases.
228             if (encoding.equals("UTF-8")) {
229                 return new UTF8Reader(stream, 
230                     fTempString.ch.length, 
231                     fErrorReporter.getMessageFormatter(XMLMessageFormatter.XML_DOMAIN), 
232                     fErrorReporter.getLocale() );
233             }
234             
235             // Try to use a Java reader.
236             String   javaEncoding = EncodingMap.getIANA2JavaMapping(encoding);
237             
238             // If the specified encoding wasn't a recognized IANA encoding throw an IOException.
239             // The XIncludeHandler will report this as a ResourceError and then will
240             // attempt to include a fallback if there is one.
241             if (javaEncoding == null) {
242                 MessageFormatter aFormatter = 
243                     fErrorReporter.getMessageFormatter(XMLMessageFormatter.XML_DOMAIN);
244                 Locale   aLocale = fErrorReporter.getLocale();
245                 throw new IOException  ( aFormatter.formatMessage( aLocale, 
246                     "EncodingDeclInvalid", 
247                     new Object  [] {encoding} ) );
248             }
249             else if (javaEncoding.equals("ASCII")) {
250                 return new ASCIIReader(stream,
251                     fTempString.ch.length,
252                     fErrorReporter.getMessageFormatter(XMLMessageFormatter.XML_DOMAIN), 
253                     fErrorReporter.getLocale() );
254             }
255             
256             return new InputStreamReader  (stream, javaEncoding);
257         }
258     }
259 
260     /** 
261      * XMLEntityManager cares about endian-ness, since it creates its own optimized
262      * readers. Since we're just using generic Java readers for now, we're not caring
263      * about endian-ness.  If this changes, even more code needs to be copied from
264      * XMLEntity manager. -- PJM
265      */
266     protected String   getEncodingName(InputStream   stream) throws IOException   {
267         final byte[] b4 = new byte[4];
268         String   encoding = null;
269 
270         // this has the potential to throw an exception
271         // it will be fixed when we ensure the stream is rewindable (see note above)
272         stream.mark(4);
273         int count = stream.read(b4, 0, 4);
274         stream.reset();
275         if (count == 4) {
276             encoding = getEncodingName(b4);
277         }
278 
279         return encoding;
280     }
281 
282     /**
283      * Removes the byte order mark from the stream, if
284      * it exists and returns the encoding name.
285      * 
286      * @param stream
287      * @param encoding
288      * @throws IOException
289      */
290     protected String   consumeBOM(InputStream   stream, String   encoding)
291         throws IOException   {
292 
293         byte[] b = new byte[3];
294         int count = 0;
295         stream.mark(3);
296         if (encoding.equals("UTF-8")) {
297             count = stream.read(b, 0, 3);
298             if (count == 3) {
299                 final int b0 = b[0] & 0xFF; 
300                 final int b1 = b[1] & 0xFF;
301                 final int b2 = b[2] & 0xFF;
302                 if (b0 != 0xEF || b1 != 0xBB || b2 != 0xBF) {
303                     // First three bytes are not BOM, so reset.
304                     stream.reset();
305                 }
306             }
307             else {
308                 stream.reset();
309             }
310         }
311         else if (encoding.startsWith("UTF-16")) {
312             count = stream.read(b, 0, 2);
313             if (count == 2) {
314                 final int b0 = b[0] & 0xFF;
315                 final int b1 = b[1] & 0xFF;
316                 if (b0 == 0xFE && b1 == 0xFF) {
317                     return "UTF-16BE";
318                 }
319                 else if (b0 == 0xFF && b1 == 0xFE) {
320                     return "UTF-16LE";
321                 }
322             }
323             // First two bytes are not BOM, so reset.
324             stream.reset();
325         }
326         // We could do UTF-32, but since the getEncodingName() doesn't support that
327         // we won't support it here.
328         // To implement UTF-32, look for:  00 00 FE FF for big-endian
329         //                             or  FF FE 00 00 for little-endian
330         return encoding;
331     }
332 
333     /**
334      * REVISIT: This code is taken from org.apache.xerces.impl.XMLEntityManager.
335      *          Is there any way we can share the code, without having it implemented twice?
336      *          I think we should make it public and static in XMLEntityManager. --PJM
337      *
338      * Returns the IANA encoding name that is auto-detected from
339      * the bytes specified, with the endian-ness of that encoding where appropriate.
340      *
341      * @param b4    The first four bytes of the input.
342      * @return the encoding name, or null if no encoding could be detected
343      */
344     protected String   getEncodingName(byte[] b4) {
345 
346         // UTF-16, with BOM
347         int b0 = b4[0] & 0xFF;
348         int b1 = b4[1] & 0xFF;
349         if (b0 == 0xFE && b1 == 0xFF) {
350             // UTF-16, big-endian
351             return "UTF-16BE";
352         }
353         if (b0 == 0xFF && b1 == 0xFE) {
354             // UTF-16, little-endian
355             return "UTF-16LE";
356         }
357 
358         // UTF-8 with a BOM
359         int b2 = b4[2] & 0xFF;
360         if (b0 == 0xEF && b1 == 0xBB && b2 == 0xBF) {
361             return "UTF-8";
362         }
363 
364         // other encodings
365         int b3 = b4[3] & 0xFF;
366         if (b0 == 0x00 && b1 == 0x00 && b2 == 0x00 && b3 == 0x3C) {
367             // UCS-4, big endian (1234)
368             return "ISO-10646-UCS-4";
369         }
370         if (b0 == 0x3C && b1 == 0x00 && b2 == 0x00 && b3 == 0x00) {
371             // UCS-4, little endian (4321)
372             return "ISO-10646-UCS-4";
373         }
374         if (b0 == 0x00 && b1 == 0x00 && b2 == 0x3C && b3 == 0x00) {
375             // UCS-4, unusual octet order (2143)
376             return "ISO-10646-UCS-4";
377         }
378         if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x00) {
379             // UCS-4, unusual octect order (3412)
380             return "ISO-10646-UCS-4";
381         }
382         if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x3F) {
383             // UTF-16, big-endian, no BOM
384             // (or could turn out to be UCS-2...
385             return "UTF-16BE";
386         }
387         if (b0 == 0x3C && b1 == 0x00 && b2 == 0x3F && b3 == 0x00) {
388             // UTF-16, little-endian, no BOM
389             // (or could turn out to be UCS-2...
390             return "UTF-16LE";
391         }
392         if (b0 == 0x4C && b1 == 0x6F && b2 == 0xA7 && b3 == 0x94) {
393             // EBCDIC
394             // a la xerces1, return CP037 instead of EBCDIC here
395             return "CP037";
396         }
397 
398         // this signals us to use the value from the encoding attribute
399         return null;
400 
401     } // getEncodingName(byte[]):Object[]
402 
403     /**
404      * Read the input stream as text, and pass the text on to the XIncludeHandler
405      * using calls to characters().  This will read all of the text it can from the
406      * resource.
407      * 
408      * @throws IOException
409      */
410     public void parse() throws IOException   {
411         
412         fReader = getReader(fSource);
413         fSource = null;
414         int readSize = fReader.read(fTempString.ch, 0, fTempString.ch.length - 1);
415         while (readSize != -1) {
416             for (int i = 0; i < readSize; ++i) {
417                 char ch = fTempString.ch[i];
418                 if (!isValid(ch)) {
419                     if (XMLChar.isHighSurrogate(ch)) {
420                         int ch2;
421                         // retrieve next character
422                         if (++i < readSize) {
423                             ch2 = fTempString.ch[i];
424                         }
425                         // handle rare boundary case
426                         else {
427                             ch2 = fReader.read();
428                             if (ch2 != -1) {
429                                 fTempString.ch[readSize++] = (char) ch2;
430                             }
431                         }
432                         if (XMLChar.isLowSurrogate(ch2)) {
433                             // convert surrogates to a supplemental character
434                             int sup = XMLChar.supplemental(ch, (char)ch2);
435                             if (!isValid(sup)) {
436                                 fErrorReporter.reportError(XMLMessageFormatter.XML_DOMAIN,
437                                                            "InvalidCharInContent", 
438                                                            new Object  [] { Integer.toString(sup, 16) },
439                                                            XMLErrorReporter.SEVERITY_FATAL_ERROR);
440                             }
441                         }
442                         else {
443                             fErrorReporter.reportError(XMLMessageFormatter.XML_DOMAIN,
444                                                        "InvalidCharInContent", 
445                                                        new Object  [] { Integer.toString(ch2, 16) },
446                                                        XMLErrorReporter.SEVERITY_FATAL_ERROR);
447                         }
448                     }
449                     else {
450                         fErrorReporter.reportError(XMLMessageFormatter.XML_DOMAIN,
451                                                    "InvalidCharInContent", 
452                                                    new Object  [] { Integer.toString(ch, 16) },
453                                                    XMLErrorReporter.SEVERITY_FATAL_ERROR);
454                     }
455                 }
456             }
457             if (fHandler != null && readSize > 0) {
458                 fTempString.offset = 0;
459                 fTempString.length = readSize;
460                 fHandler.characters(
461                     fTempString,
462                     fHandler.modifyAugmentations(null, true));
463             }
464             readSize = fReader.read(fTempString.ch, 0, fTempString.ch.length - 1);
465         }
466         
467     }
468     
469     /**
470      * Sets the input source on this text reader.
471      * 
472      * @param source The XMLInputSource to use.
473      */
474     public void setInputSource(XMLInputSource source) {
475         fSource = source;
476     }
477     
478     /**
479      * Closes the stream.  Call this after parse(), or when there is no longer any need
480      * for this object.
481      * 
482      * @throws IOException
483      */
484     public void close() throws IOException   {
485         if (fReader != null) {
486             fReader.close();
487             fReader = null;
488         }
489     }
490     
491     /**
492      * Returns true if the specified character is a valid XML character
493      * as per the rules of XML 1.0.
494      *
495      * @param ch The character to check.
496      */
497     protected boolean isValid(int ch) {
498         return XMLChar.isValid(ch);
499     }
500     
501     /**
502      * Sets the buffer size property for the reader which decides the chunk sizes that are parsed
503      * by the reader at a time and passed to the handler
504      * 
505      * @param bufferSize The size of the buffer desired
506      */
507     protected void setBufferSize(int bufferSize) {
508         if (fTempString.ch.length != ++bufferSize) {
509             fTempString.ch = new char[bufferSize];
510         }
511     }
512  
513 }
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags