XIncludeTextReader


1   /*
2    * The Apache Software License, Version 1.1
3    *
4    *
5    * Copyright (c) 2001-2004 The Apache Software Foundation.  All rights
6    * reserved.
7    *
8    * Redistribution and use in source and binary forms, with or without
9    * modification, are permitted provided that the following conditions
10   * are met:
11   *
12   * 1. Redistributions of source code must retain the above copyright
13   *    notice, this list of conditions and the following disclaimer.
14   *
15   * 2. Redistributions in binary form must reproduce the above copyright
16   *    notice, this list of conditions and the following disclaimer in
17   *    the documentation and/or other materials provided with the
18   *    distribution.
19   *
20   * 3. The end-user documentation included with the redistribution,
21   *    if any, must include the following acknowledgment:
22   *       "This product includes software developed by the
23   *        Apache Software Foundation (http://www.apache.org/)."
24   *    Alternately, this acknowledgment may appear in the software itself,
25   *    if and wherever such third-party acknowledgments normally appear.
26   *
27   * 4. The names "Xerces" and "Apache Software Foundation" must
28   *    not be used to endorse or promote products derived from this
29   *    software without prior written permission. For written
30   *    permission, please contact apache@apache.org.
31   *
32   * 5. Products derived from this software may not be called "Apache",
33   *    nor may "Apache" appear in their name, without prior written
34   *    permission of the Apache Software Foundation.
35   *
36   * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
37   * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
38   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
39   * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
40   * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
41   * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
42   * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
43   * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
44   * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
45   * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
46   * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
47   * SUCH DAMAGE.
48   * ====================================================================
49   *
50   * This software consists of voluntary contributions made by many
51   * individuals on behalf of the Apache Software Foundation and was
52   * originally based on software copyright (c) 2003, International
53   * Business Machines, Inc., http://www.apache.org.  For more
54   * information on the Apache Software Foundation, please see
55   * <http://www.apache.org/>.
56   */
57  
58  package com.sun.org.apache.xerces.internal.xinclude;
59  
60  import java.io.BufferedInputStream  ;
61  import java.io.IOException  ;
62  import java.io.InputStream  ;
63  import java.io.InputStreamReader  ;
64  import java.io.Reader  ;
65  import java.net.HttpURLConnection  ;
66  import java.net.URL  ;
67  import java.net.URLConnection  ;
68  import java.util.Locale  ;
69  
70  import com.sun.org.apache.xerces.internal.impl.io.ASCIIReader;
71  import com.sun.org.apache.xerces.internal.impl.io.UTF8Reader;
72  import com.sun.org.apache.xerces.internal.impl.msg.XMLMessageFormatter;
73  import com.sun.org.apache.xerces.internal.impl.XMLEntityManager;
74  import com.sun.org.apache.xerces.internal.impl.XMLErrorReporter;
75  import com.sun.org.apache.xerces.internal.util.EncodingMap;
76  import com.sun.org.apache.xerces.internal.util.MessageFormatter;
77  import com.sun.org.apache.xerces.internal.util.XMLChar;
78  import com.sun.org.apache.xerces.internal.util.XMLStringBuffer;
79  import com.sun.org.apache.xerces.internal.xni.parser.XMLInputSource;
80  
81  /**
82   * This class is used for reading resources requested in &lt;include&gt; elements,
83   * when the parse attribute of the &lt;include&gt; element is "text".  Using this
84   * class will open the location, detect the encoding, and discard the byte order
85   * mark, if applicable.
86   * 
87   * REVISIT:
88   * Much of the code in this class is taken from XMLEntityManager.  It would be nice
89   * if this code could be shared in some way.  However, since XMLEntityManager is used
90   * for reading files as XML, and this needs to read files as text, there would need
91   * to be some refactoring done.
92   * 
93   * @author Michael Glavassevich, IBM
94   * @author Peter McCracken, IBM
95   * @author Arun Yadav, Sun Microsystems Inc.
96   *
97   * @version $Id: XIncludeTextReader.java,v 1.10 2004/04/15 04:51:56 mrglavas Exp $
98   *
99   * @see XIncludeHandler
100  */
101 public class XIncludeTextReader {
102 
103     private Reader   fReader;
104     private XIncludeHandler fHandler;
105     private XMLInputSource fSource;
106     private XMLErrorReporter fErrorReporter;
107     
108     // Content negotation parameters
109     private String   fAccept;
110     private String   fAcceptLanguage;
111  
112     /**
113      * Construct the XIncludeReader using the XMLInputSource and XIncludeHandler.
114      *
115      * @param source The XMLInputSource to use.
116      * @param handler The XIncludeHandler to use.
117      */
118     public XIncludeTextReader(XMLInputSource source, XIncludeHandler handler)
119         throws IOException   {
120         fHandler = handler;
121         fSource = source;
122     }
123     
124     /**
125      * Sets the XMLErrorReporter used for reporting errors while
126      * reading the text include.
127      *
128      * @param errorReporter the XMLErrorReporter to be used for
129      * reporting errors.
130      */
131     public void setErrorReporter(XMLErrorReporter errorReporter) {
132         fErrorReporter = errorReporter;
133     }
134     
135     /**
136      * Sets content negotation parameters to be attached to an HTTP request.
137      * 
138      * @param accept the Accept HTTP request property
139      * @param acceptLanguage the Accept-Language HTTP request property
140      */
141     public void setHttpProperties(String   accept, String   acceptLanguage) {
142         fAccept = accept;
143         fAcceptLanguage = acceptLanguage;
144     }
145 
146     /**
147      * Return the Reader for given XMLInputSource.
148      *
149      * @param source The XMLInputSource to use.
150      */
151     protected Reader   getReader(XMLInputSource source) throws IOException   {
152         if (source.getCharacterStream() != null) {
153             return source.getCharacterStream();
154         }
155         else {
156             InputStream   stream = null;
157 
158             String   encoding = source.getEncoding();
159             if (encoding == null) {
160                 encoding = "UTF-8";
161             }
162             if (source.getByteStream() != null) {
163                 stream = source.getByteStream();
164                 // Wrap the InputStream so that it is possible to rewind it.
165                 if (!(stream instanceof BufferedInputStream  )) {
166                     stream = new BufferedInputStream  (stream);
167                 }
168             }
169             else {
170                 String   expandedSystemId = XMLEntityManager.expandSystemId(source.getSystemId(), source.getBaseSystemId(), false);
171 
172                 URL   url = new URL  (expandedSystemId);
173                 URLConnection   urlCon = url.openConnection();
174                 
175                 // If this is an HTTP connection attach any 
176                 // content negotation parameters to the request.
177                 if (urlCon instanceof HttpURLConnection  ) {
178                     if( fAccept != null && fAccept.length() > 0) {
179                         urlCon.setRequestProperty(XIncludeHandler.HTTP_ACCEPT, fAccept);
180                     }
181                     if( fAcceptLanguage != null && fAcceptLanguage.length() > 0) {
182                         urlCon.setRequestProperty(XIncludeHandler.HTTP_ACCEPT_LANGUAGE, fAcceptLanguage);
183                     }
184                 }
185                 
186                 // Wrap the InputStream so that it is possible to rewind it.
187                 stream = new BufferedInputStream  (urlCon.getInputStream());
188                 
189                 // content type will be string like "text/xml; charset=UTF-8" or "text/xml"
190                 String   rawContentType = urlCon.getContentType();
191                 
192                 // text/xml and application/xml offer only one optional parameter
193                 int index = (rawContentType != null) ? rawContentType.indexOf(';') : -1;
194 
195                 String   contentType = null;
196                 String   charset = null;
197                 if (index != -1) {
198                     // this should be something like "text/xml"
199                     contentType = rawContentType.substring(0, index).trim();
200 
201                     // this should be something like "charset=UTF-8", but we want to
202                     // strip it down to just "UTF-8"
203                     charset = rawContentType.substring(index + 1).trim();
204                     if (charset.startsWith("charset=")) {
205                         // 8 is the length of "charset="
206                         charset = charset.substring(8).trim();
207                         // strip quotes, if present
208                         if ((charset.charAt(0) == '"'
209                             && charset.charAt(charset.length() - 1) == '"')
210                             || (charset.charAt(0) == '\''
211                                 && charset.charAt(charset.length() - 1)
212                                     == '\'')) {
213                             charset =
214                                 charset.substring(1, charset.length() - 1);
215                         }
216                     }
217                     else {
218                         charset = null;
219                     }
220                 }
221                 else {
222                     contentType = rawContentType.trim();
223                 }
224 
225                 String   detectedEncoding = null;
226                 /**  The encoding of such a resource is determined by:
227                     1 external encoding information, if available, otherwise
228                          -- the most common type of external information is the "charset" parameter of a MIME package
229                     2 if the media type of the resource is text/xml, application/xml, or matches the conventions text/*+xml or application/*+xml as described in XML Media Types [IETF RFC 3023], the encoding is recognized as specified in XML 1.0, otherwise
230                     3 the value of the encoding attribute if one exists, otherwise
231                     4 UTF-8.
232                  **/
233                 if (contentType.equals("text/xml")) {
234                     if (charset != null) {
235                         detectedEncoding = charset;
236                     }
237                     else {
238                         // see RFC2376 or 3023, section 3.1
239                         detectedEncoding = "US-ASCII";
240                     }
241                 }
242                 else if (contentType.equals("application/xml")) {
243                     if (charset != null) {
244                         detectedEncoding = charset;
245                     }
246                     else {
247                         // see RFC2376 or 3023, section 3.2
248                         detectedEncoding = getEncodingName(stream);
249                     }
250                 }
251                 else if (contentType.endsWith("+xml")) {
252                     detectedEncoding = getEncodingName(stream);
253                 }
254 
255                 if (detectedEncoding != null) {
256                     encoding = detectedEncoding;
257                 }
258                 // else 3 or 4.
259             }
260             
261             encoding = encoding.toUpperCase(Locale.ENGLISH);
262             
263             // eat the Byte Order Mark
264             consumeBOM(stream, encoding);
265             
266             // If the document is UTF-8 or US-ASCII use 
267             // the Xerces readers for these encodings. For
268             // US-ASCII consult the encoding map since
269             // this encoding has many aliases.
270             if (encoding.equals("UTF-8")) {
271                 return new UTF8Reader(stream, 
272                     XMLEntityManager.DEFAULT_BUFFER_SIZE, 
273                     fErrorReporter.getMessageFormatter(XMLMessageFormatter.XML_DOMAIN), 
274                     fErrorReporter.getLocale() );
275             }
276             
277             // Try to use a Java reader.
278             String   javaEncoding = EncodingMap.getIANA2JavaMapping(encoding);
279             
280             // If the specified encoding wasn't a recognized IANA encoding throw an IOException.
281             // The XIncludeHandler will report this as a ResourceError and then will
282             // attempt to include a fallback if there is one.
283             if (javaEncoding == null) {
284                 MessageFormatter aFormatter = 
285                     fErrorReporter.getMessageFormatter(XMLMessageFormatter.XML_DOMAIN);
286                 Locale   aLocale = fErrorReporter.getLocale();
287                 throw new IOException  ( aFormatter.formatMessage( aLocale, 
288                     "EncodingDeclInvalid", 
289                     new Object  [] {encoding} ) );
290             }
291             else if (javaEncoding.equals("ASCII")) {
292                 return new ASCIIReader(stream,
293                     XMLEntityManager.DEFAULT_BUFFER_SIZE,
294                     fErrorReporter.getMessageFormatter(XMLMessageFormatter.XML_DOMAIN), 
295                     fErrorReporter.getLocale() );
296             }
297             
298             return new InputStreamReader  (stream, javaEncoding);
299         }
300     }
301 
302     /** 
303      * XMLEntityManager cares about endian-ness, since it creates its own optimized
304      * readers. Since we're just using generic Java readers for now, we're not caring
305      * about endian-ness.  If this changes, even more code needs to be copied from
306      * XMLEntity manager. -- PJM
307      */
308     protected String   getEncodingName(InputStream   stream) throws IOException   {
309         final byte[] b4 = new byte[4];
310         String   encoding = null;
311 
312         // this has the potential to throw an exception
313         // it will be fixed when we ensure the stream is rewindable (see note above)
314         stream.mark(4);
315         int count = stream.read(b4, 0, 4);
316         stream.reset();
317         if (count == 4) {
318             encoding = getEncodingName(b4);
319         }
320 
321         return encoding;
322     }
323 
324     /**
325      * Removes the byte order mark from the stream, if it exists.
326      * @param stream
327      * @param encoding
328      * @throws IOException
329      */
330     protected void consumeBOM(InputStream   stream, String   encoding)
331         throws IOException   {
332 
333         byte[] b = new byte[3];
334         int count = 0;
335         stream.mark(3);
336         if (encoding.equals("UTF-8")) {
337             count = stream.read(b, 0, 3);
338             if (count == 3) {
339                 int b0 = b[0] & 0xFF; 
340                 int b1 = b[1] & 0xFF;
341                 int b2 = b[2] & 0xFF;
342                 if (b0 != 0xEF || b1 != 0xBB || b2 != 0xBF) {
343                     // First three bytes are not BOM, so reset.
344                     stream.reset();
345                 }
346             }
347             else {
348                 stream.reset();
349             }
350         }
351         else if (encoding.startsWith("UTF-16")) {
352             count = stream.read(b, 0, 2);
353             if (count == 2) {
354                 int b0 = b[0] & 0xFF;
355                 int b1 = b[1] & 0xFF;
356                 if ((b0 != 0xFE || b1 != 0xFF) 
357                     && (b0 != 0xFF || b1 != 0xFE)) {
358                     // First two bytes are not BOM, so reset.
359                     stream.reset();
360                 }
361             }
362             else {
363                 stream.reset();
364             }
365         }
366         // We could do UTF-32, but since the getEncodingName() doesn't support that
367         // we won't support it here.
368         // To implement UTF-32, look for:  00 00 FE FF for big-endian
369         //                             or  FF FE 00 00 for little-endian
370     }
371 
372     /**
373      * REVISIT: This code is taken from com.sun.org.apache.xerces.internal.impl.XMLEntityManager.
374      *          Is there any way we can share the code, without having it implemented twice?
375      *          I think we should make it public and static in XMLEntityManager. --PJM
376      *
377      * Returns the IANA encoding name that is auto-detected from
378      * the bytes specified, with the endian-ness of that encoding where appropriate.
379      *
380      * @param b4    The first four bytes of the input.
381      * @return the encoding name, or null if no encoding could be detected
382      */
383     protected String   getEncodingName(byte[] b4) {
384 
385         // UTF-16, with BOM
386         int b0 = b4[0] & 0xFF;
387         int b1 = b4[1] & 0xFF;
388         if (b0 == 0xFE && b1 == 0xFF) {
389             // UTF-16, big-endian
390             return "UTF-16BE";
391         }
392         if (b0 == 0xFF && b1 == 0xFE) {
393             // UTF-16, little-endian
394             return "UTF-16LE";
395         }
396 
397         // UTF-8 with a BOM
398         int b2 = b4[2] & 0xFF;
399         if (b0 == 0xEF && b1 == 0xBB && b2 == 0xBF) {
400             return "UTF-8";
401         }
402 
403         // other encodings
404         int b3 = b4[3] & 0xFF;
405         if (b0 == 0x00 && b1 == 0x00 && b2 == 0x00 && b3 == 0x3C) {
406             // UCS-4, big endian (1234)
407             return "ISO-10646-UCS-4";
408         }
409         if (b0 == 0x3C && b1 == 0x00 && b2 == 0x00 && b3 == 0x00) {
410             // UCS-4, little endian (4321)
411             return "ISO-10646-UCS-4";
412         }
413         if (b0 == 0x00 && b1 == 0x00 && b2 == 0x3C && b3 == 0x00) {
414             // UCS-4, unusual octet order (2143)
415             return "ISO-10646-UCS-4";
416         }
417         if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x00) {
418             // UCS-4, unusual octect order (3412)
419             return "ISO-10646-UCS-4";
420         }
421         if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x3F) {
422             // UTF-16, big-endian, no BOM
423             // (or could turn out to be UCS-2...
424             return "UTF-16BE";
425         }
426         if (b0 == 0x3C && b1 == 0x00 && b2 == 0x3F && b3 == 0x00) {
427             // UTF-16, little-endian, no BOM
428             // (or could turn out to be UCS-2...
429             return "UTF-16LE";
430         }
431         if (b0 == 0x4C && b1 == 0x6F && b2 == 0xA7 && b3 == 0x94) {
432             // EBCDIC
433             // a la xerces1, return CP037 instead of EBCDIC here
434             return "CP037";
435         }
436 
437         // this signals us to use the value from the encoding attribute
438         return null;
439 
440     } // getEncodingName(byte[]):Object[]
441 
442     /**
443      * Read the input stream as text, and pass the text on to the XIncludeHandler
444      * using calls to characters().  This will read all of the text it can from the
445      * resource.
446      * 
447      * @throws IOException
448      */
449     public void parse() throws IOException   {
450         // REVISIT: This method needs to be rewritten to improve performance: both
451         // time and memory. We should be reading chunks and reporting chunks instead 
452         // of reading characters individually and reporting all the characters in 
453         // one callback. Also, currently we don't provide any locator information:
454         // line number, column number, etc... so if we report an error it will appear
455         // as if the invalid XML character was in the include parent. -- mrglavas
456         XMLStringBuffer buffer = new XMLStringBuffer();
457         fReader = getReader(fSource);
458         int ch;
459         while((ch = fReader.read()) != -1) {
460             if (isValid(ch)) {
461                 buffer.append((char)ch);
462             }
463             else if (XMLChar.isHighSurrogate(ch)) {
464                 int ch2 = fReader.read();
465                 if (XMLChar.isLowSurrogate(ch2)) {
466 
467                     // convert surrogates to a supplemental character
468                     int sup = XMLChar.supplemental((char)ch, (char)ch2);
469 
470                     // supplemental character must be a valid XML character
471                     if (!isValid(sup)) {
472                         fErrorReporter.reportError(XMLMessageFormatter.XML_DOMAIN,
473                                                    "InvalidCharInContent", 
474                                                    new Object  [] { Integer.toString(sup, 16) },
475                                                    XMLErrorReporter.SEVERITY_FATAL_ERROR);
476                         continue;
477                     }                 
478                     buffer.append((char) ch);
479                     buffer.append((char) ch2);
480                 }
481                 else {
482                     fErrorReporter.reportError(XMLMessageFormatter.XML_DOMAIN,
483                                                "InvalidCharInContent", 
484                                                new Object  [] { Integer.toString(ch, 16) },
485                                                XMLErrorReporter.SEVERITY_FATAL_ERROR);
486                 }
487             }
488             else {
489                 fErrorReporter.reportError(XMLMessageFormatter.XML_DOMAIN,
490                                            "InvalidCharInContent", 
491                                            new Object  [] { Integer.toString(ch, 16) },
492                                            XMLErrorReporter.SEVERITY_FATAL_ERROR);
493             }
494         }
495         if (fHandler != null && buffer.length > 0) {
496             fHandler.characters(
497                 buffer,
498                 fHandler.modifyAugmentations(null, true));
499         }
500     }
501     
502     /**
503      * Closes the stream.  Call this after parse(), or when there is no longer any need
504      * for this object.
505      * 
506      * @throws IOException
507      */
508     public void close() throws IOException   {
509         if (fReader != null) {
510             fReader.close();
511         }
512     }
513     
514     /**
515      * Returns true if the specified character is a valid XML character
516      * as per the rules of XML 1.0.
517      *
518      * @param ch The character to check.
519      */
520     protected boolean isValid(int ch) {
521         return XMLChar.isValid(ch);
522     }
523 }
524
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags