SimpleXMLParser


1   /*
2    * Copyright 2003 Paulo Soares
3    *
4    * The contents of this file are subject to the Mozilla Public License Version 1.1
5    * (the "License"); you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at http://www.mozilla.org/MPL/
7    *
8    * Software distributed under the License is distributed on an "AS IS" basis,
9    * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
10   * for the specific language governing rights and limitations under the License.
11   *
12   * The Original Code is 'iText, a free JAVA-PDF library'.
13   *
14   * The Initial Developer of the Original Code is Bruno Lowagie. Portions created by
15   * the Initial Developer are Copyright (C) 1999, 2000, 2001, 2002 by Bruno Lowagie.
16   * All Rights Reserved.
17   * Co-Developer of the code is Paulo Soares. Portions created by the Co-Developer
18   * are Copyright (C) 2000, 2001, 2002 by Paulo Soares. All Rights Reserved.
19   *
20   * Contributor(s): all the names of the contributors are added in the source code
21   * where applicable.
22   *
23   * Alternatively, the contents of this file may be used under the terms of the
24   * LGPL license (the "GNU LIBRARY GENERAL PUBLIC LICENSE"), in which case the
25   * provisions of LGPL are applicable instead of those above.  If you wish to
26   * allow use of your version of this file only under the terms of the LGPL
27   * License and not to allow others to use your version of this file under
28   * the MPL, indicate your decision by deleting the provisions above and
29   * replace them with the notice and other provisions required by the LGPL.
30   * If you do not delete the provisions above, a recipient may use your version
31   * of this file under either the MPL or the GNU LIBRARY GENERAL PUBLIC LICENSE.
32   *
33   * This library is free software; you can redistribute it and/or modify it
34   * under the terms of the MPL as stated above or under the terms of the GNU
35   * Library General Public License as published by the Free Software Foundation;
36   * either version 2 of the License, or any later version.
37   *
38   * This library is distributed in the hope that it will be useful, but WITHOUT
39   * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
40   * FOR A PARTICULAR PURPOSE. See the GNU Library general Public License for more
41   * details.
42   *
43   * If you didn't download this code from the following link, you should check if
44   * you aren't using an obsolete version:
45   * http://www.lowagie.com/iText/
46   *
47   * The code to recognize the encoding in this class and in the convenience class IanaEncodings was taken from Apache Xerces published under the following license:
48   * Licensed to the Apache Software Foundation (ASF) under one or more
49   * contributor license agreements.  See the NOTICE file distributed with
50   * this work for additional information regarding copyright ownership.
51   * The ASF licenses this file to You under the Apache License, Version 2.0
52   * (the "License"); you may not use this file except in compliance with
53   * the License.  You may obtain a copy of the License at
54   * 
55   *      http://www.apache.org/licenses/LICENSE-2.0
56   * 
57   * Unless required by applicable law or agreed to in writing, software
58   * distributed under the License is distributed on an "AS IS" BASIS,
59   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
60   * See the License for the specific language governing permissions and
61   * limitations under the License.
62   * 
63   * Part of this code is based on the Quick-and-Dirty XML parser by Steven Brandt.
64   * The code for the Quick-and-Dirty parser was published in JavaWorld (java tip 128).
65   * Steven Brandt and JavaWorld gave permission to use the code for free.
66   * (Bruno Lowagie and Paulo Soares chose to use it under the MPL/LGPL in
67   * conformance with the rest of the code).
68   * The original code can be found on this url: <A HREF="http://www.javaworld.com/javatips/jw-javatip128_p.html">http://www.javaworld.com/javatips/jw-javatip128_p.html</A>.
69   * It was substantially refactored by Bruno Lowagie.
70   * 
71   * The method 'private static String getEncodingName(byte[] b4)' was found
72   * in org.apache.xerces.impl.XMLEntityManager, originally published by the
73   * Apache Software Foundation under the Apache Software License; now being
74   * used in iText under the MPL.
75   */
76  package com.lowagie.text.xml.simpleparser;
77  
import java.io.BufferedReader;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PushbackInputStream;
import java.io.Reader;
import java.util.HashMap;
import java.util.Locale;
import java.util.Stack;
86  
87  /**
88   * A simple XML and HTML parser.  This parser is, like the SAX parser,
89   * an event based parser, but with much less functionality.
90   * <p>
91   * The parser can:
92   * <p>
93   * <ul>
94   * <li>It recognizes the encoding used
95   * <li>It recognizes all the elements' start tags and end tags
96   * <li>It lists attributes, where attribute values can be enclosed in single or double quotes
97   * <li>It recognizes the <code>&lt;[CDATA[ ... ]]&gt;</code> construct
98   * <li>It recognizes the standard entities: &amp;amp;, &amp;lt;, &amp;gt;, &amp;quot;, and &amp;apos;, as well as numeric entities
99   * <li>It maps lines ending in <code>\r\n</code> and <code>\r</code> to <code>\n</code> on input, in accordance with the XML Specification, Section 2.11
100  * </ul>
101  * <p>
102  */
103 public class SimpleXMLParser {
104     /** possible states */
105     private final static int UNKNOWN = 0;
106     private final static int TEXT = 1;
107     private final static int TAG_ENCOUNTERED = 2;
108     private final static int EXAMIN_TAG = 3;
109     private final static int TAG_EXAMINED = 4;
110     private final static int IN_CLOSETAG = 5;
111     private final static int SINGLE_TAG = 6;
112     private final static int CDATA = 7;
113     private final static int COMMENT = 8;
114     private final static int PI = 9;
115     private final static int ENTITY = 10;
116     private final static int QUOTE = 11;
117     private final static int ATTRIBUTE_KEY = 12;
118     private final static int ATTRIBUTE_EQUAL = 13;
119     private final static int ATTRIBUTE_VALUE = 14;
120     
121     /** the state stack */
122     protected Stack   stack;
123     /** The current character. */
124     protected int character = 0;
125     /** The previous character. */
126     protected int previousCharacter = -1;
127     /** the line we are currently reading */
128     protected int lines = 1;
129     /** the column where the current character occurs */
130     protected int columns = 0;
131     /** was the last character equivalent to a newline? */
132     protected boolean eol = false;
133     /** the current state */
134     protected int state;
135     /** Are we parsing HTML? */
136     protected boolean html;
137     /** current text (whatever is encountered between tags) */
138     protected StringBuffer   text = new StringBuffer  ();
139     /** current entity (whatever is encountered between & and ;) */
140     protected StringBuffer   entity = new StringBuffer  ();
141     /** current tagname */
142     protected String   tag = null;
143     /** current attributes */
144     protected HashMap   attributes = null;
145     /** The handler to which we are going to forward document content */
146     protected SimpleXMLDocHandler doc;
147     /** The handler to which we are going to forward comments. */
148     protected SimpleXMLDocHandlerComment comment;
149     /** Keeps track of the number of tags that are open. */
150     int nested = 0;
151     /** the quote character that was used to open the quote. */
152     protected int quoteCharacter = '"';
153     /** the attribute key. */
154     String   attributekey = null;
155     /** the attribute value. */
156     String   attributevalue = null;
157     
158     /**
159      * Creates a Simple XML parser object.
160      * Call go(BufferedReader) immediately after creation.
161      */
162     private SimpleXMLParser(SimpleXMLDocHandler doc, SimpleXMLDocHandlerComment comment, boolean html) {
163         this.doc = doc;
164         this.comment = comment;
165         this.html = html;
166         stack = new Stack  ();
167         state = html ? TEXT : UNKNOWN;
168     }
169     
170     /**
171      * Does the actual parsing. Perform this immediately
172      * after creating the parser object.
173      */
174     private void go(Reader   r) throws IOException   {
175         BufferedReader   reader;
176         if (r instanceof BufferedReader  )
177             reader = (BufferedReader  )r;
178         else
179             reader = new BufferedReader  (r);
180         doc.startDocument();
181         while(true) {
182             // read a new character
183             if (previousCharacter == -1) {
184                 character = reader.read();
185             }
186             // or re-examin the previous character
187             else {
188                 character = previousCharacter;
189                 previousCharacter = -1;
190             }
191             
192             // the end of the file was reached
193             if (character == -1) {
194                 if (html) {
195                     if (html && state == TEXT)
196                         flush();
197                     doc.endDocument();
198                 } else {
199                     throwException("Missing end tag");
200                 }
201                 return;
202             }
203             
204             // dealing with  \n and \r
205             if (character == '\n' && eol) {
206                 eol = false;
207                 continue;
208             } else if (eol) {
209                 eol = false;
210             } else if (character == '\n') {
211                 lines++;
212                 columns = 0;
213             } else if (character == '\r') {
214                 eol = true;
215                 character = '\n';
216                 lines++;
217                 columns = 0;
218             } else {
219                 columns++;
220             }
221             
222             switch(state) {
223             // we are in an unknown state before there's actual content
224             case UNKNOWN:
225                 if(character == '<') {
226                     saveState(TEXT);
227                     state = TAG_ENCOUNTERED;
228                 }
229                 break;
230             // we can encounter any content
231             case TEXT:
232                 if(character == '<') {
233                     flush();
234                     saveState(state);
235                     state = TAG_ENCOUNTERED;
236                 } else if(character == '&') {
237                     saveState(state);
238                     entity.setLength(0);
239                     state = ENTITY;
240                 } else
241                     text.append((char)character);
242                 break;
243             // we have just seen a < and are wondering what we are looking at
244             // <foo>, </foo>, <!-- ... --->, etc.
245             case TAG_ENCOUNTERED:
246                 initTag();
247                 if(character == '/') {
248                     state = IN_CLOSETAG;
249                 } else if (character == '?') {
250                     restoreState();
251                     state = PI;
252                 } else {
253                     text.append((char)character);
254                     state = EXAMIN_TAG;
255                 }
256                 break;
257             // we are processing something like this <foo ... >.
258             // It could still be a <!-- ... --> or something.
259             case EXAMIN_TAG:
260                 if(character == '>') {
261                     doTag();
262                     processTag(true);
263                     initTag();
264                     state = restoreState();
265                 } else if(character == '/') {
266                     state = SINGLE_TAG;
267                 } else if(character == '-' && text.toString().equals("!-")) {
268                     flush();
269                     state = COMMENT;
270                 } else if(character == '[' && text.toString().equals("![CDATA")) {
271                     flush();
272                     state = CDATA;
273                 } else if(character == 'E' && text.toString().equals("!DOCTYP")) {
274                     flush();
275                     state = PI;
276                 } else if(Character.isWhitespace((char)character)) {
277                     doTag();
278                     state = TAG_EXAMINED;
279                 } else {
280                     text.append((char)character);
281                 }
282                 break;
283             // we know the name of the tag now.
284             case TAG_EXAMINED:
285                 if(character == '>') {
286                     processTag(true);
287                     initTag();
288                     state = restoreState();
289                 } else if(character == '/') {
290                     state = SINGLE_TAG;
291                 } else if(Character.isWhitespace((char)character)) {
292                     // empty
293                 } else {
294                     text.append((char)character);
295                     state = ATTRIBUTE_KEY;
296                 }
297                 break;
298                 
299                 // we are processing a closing tag: e.g. </foo>
300             case IN_CLOSETAG:
301                 if(character == '>') {
302                     doTag();
303                     processTag(false);
304                     if(!html && nested==0) return;
305                     state = restoreState();
306                 } else {
307                     if (!Character.isWhitespace((char)character))
308                         text.append((char)character);
309                 }
310                 break;
311                 
312             // we have just seen something like this: <foo a="b"/
313             // and are looking for the final >.
314             case SINGLE_TAG:
315                 if(character != '>')
316                     throwException("Expected > for tag: <"+tag+"/>");
317                 doTag();
318                 processTag(true);
319                 processTag(false);
320                 initTag();
321                 if(!html && nested==0) {
322                     doc.endDocument();
323                     return;
324                 }
325                 state = restoreState();
326                 break;
327                 
328             // we are processing CDATA
329             case CDATA:
330                 if(character == '>'
331                 && text.toString().endsWith("]]")) {
332                     text.setLength(text.length()-2);
333                     flush();
334                     state = restoreState();
335                 } else
336                     text.append((char)character);
337                 break;
338                 
339             // we are processing a comment.  We are inside
340             // the <!-- .... --> looking for the -->.
341             case COMMENT:
342                 if(character == '>'
343                 && text.toString().endsWith("--")) {
344                     text.setLength(text.length() - 2);
345                     flush();
346                     state = restoreState();
347                 } else
348                     text.append((char)character);
349                 break;
350                 
351             // We are inside one of these <? ... ?> or one of these <!DOCTYPE ... >
352             case PI:
353                 if(character == '>') {
354                     state = restoreState();
355                     if(state == TEXT) state = UNKNOWN;
356                 }
357                 break;
358                 
359             // we are processing an entity, e.g. &lt;, &#187;, etc.
360             case ENTITY:
361                 if(character == ';') {
362                     state = restoreState();
363                     String   cent = entity.toString();
364                     entity.setLength(0);
365                     char ce = EntitiesToUnicode.decodeEntity(cent);
366                     if (ce == '\0')
367                         text.append('&').append(cent).append(';');
368                     else
369                         text.append(ce);
370                 } else if ((character != '#' && (character < '0' || character > '9') && (character < 'a' || character > 'z')
371                     && (character < 'A' || character > 'Z')) || entity.length() >= 7) {
372                     state = restoreState();
373                     previousCharacter = character;
374                     text.append('&').append(entity.toString());
375                     entity.setLength(0);
376                 }
377                 else {
378                     entity.append((char)character);
379                 }
380                 break;
381             // We are processing the quoted right-hand side of an element's attribute.
382             case QUOTE:
383                 if (html && quoteCharacter == ' ' && character == '>') {
384                     flush();
385                     processTag(true);
386                     initTag();
387                     state = restoreState();
388                 }
389                 else if (html && quoteCharacter == ' ' && Character.isWhitespace((char)character)) {
390                     flush();
391                     state = TAG_EXAMINED;
392                 }
393                 else if (html && quoteCharacter == ' ') {
394                     text.append((char)character);
395                 }
396                 else if(character == quoteCharacter) {
397                     flush();
398                     state = TAG_EXAMINED;
399                 } else if(" \r\n\u0009".indexOf(character)>=0) {
400                     text.append(' ');
401                 } else if(character == '&') {
402                     saveState(state);
403                     state = ENTITY;
404                     entity.setLength(0);
405                 } else {
406                     text.append((char)character);
407                 }
408                 break;
409                 
410             case ATTRIBUTE_KEY:
411                 if(Character.isWhitespace((char)character)) {
412                     flush();
413                     state = ATTRIBUTE_EQUAL;
414                 } else if(character == '=') {
415                     flush();
416                     state = ATTRIBUTE_VALUE;
417                 } else if (html && character == '>') {
418                     text.setLength(0);
419                     processTag(true);
420                     initTag();
421                     state = restoreState();
422                 } else {
423                     text.append((char)character);
424                 }
425                 break;
426                 
427             case ATTRIBUTE_EQUAL:
428                 if(character == '=') {
429                     state = ATTRIBUTE_VALUE;
430                 } else if(Character.isWhitespace((char)character)) {
431                     // empty
432                 } else if (html && character == '>') {
433                     text.setLength(0);
434                     processTag(true);
435                     initTag();
436                     state = restoreState();
437                 } else if (html && character == '/') {
438                     flush();
439                     state = SINGLE_TAG;
440                 } else if (html) {
441                     flush();
442                     text.append((char)character);
443                     state = ATTRIBUTE_KEY;
444                 } else {
445                     throwException("Error in attribute processing.");
446                 }
447                 break;
448                 
449             case ATTRIBUTE_VALUE:
450                 if(character == '"' || character == '\'') {
451                     quoteCharacter = character;
452                     state = QUOTE;
453                 } else if(Character.isWhitespace((char)character)) {
454                     // empty
455                 } else if (html && character == '>') {
456                     flush();
457                     processTag(true);
458                     initTag();
459                     state = restoreState();
460                 } else if (html) {
461                     text.append((char)character);
462                     quoteCharacter = ' ';
463                     state = QUOTE;
464                 } else {
465                     throwException("Error in attribute processing");
466                 }
467                 break;
468             }
469         }
470     }
471 
472     /**
473      * Gets a state from the stack
474      * @return the previous state
475      */
476     private int restoreState() {
477         if(!stack.empty())
478             return ((Integer  )stack.pop()).intValue();
479         else
480             return UNKNOWN;
481     }
482     /**
483      * Adds a state to the stack.
484      * @param   s   a state to add to the stack
485      */
486     private void saveState(int s) {
487         stack.push(new Integer  (s));
488     }
489     /**
490      * Flushes the text that is currently in the buffer.
491      * The text can be ignored, added to the document
492      * as content or as comment,... depending on the current state.
493      */
494     private void flush() {
495         switch(state){
496         case TEXT:
497         case CDATA:
498             if(text.length() > 0) {
499                 doc.text(text.toString());
500             }
501             break;
502         case COMMENT:
503             if (comment != null) {
504                 comment.comment(text.toString());
505             }
506             break;
507         case ATTRIBUTE_KEY:
508             attributekey = text.toString();
509             if (html)
510                 attributekey = attributekey.toLowerCase();
511             break;
512         case QUOTE:
513         case ATTRIBUTE_VALUE:
514             attributevalue = text.toString();
515             attributes.put(attributekey,attributevalue);
516             break;
517         default:
518             // do nothing
519         }
520         text.setLength(0);
521     }
522     /**
523      * Initialized the tag name and attributes.
524      */
525     private void initTag() {
526         tag = null;
527         attributes = new HashMap  ();
528     }
529     /** Sets the name of the tag. */
530     private void doTag() {
531         if(tag == null)
532             tag = text.toString();
533         if (html)
534             tag = tag.toLowerCase();
535         text.setLength(0);
536     }
537     /**
538      * processes the tag.
539      * @param start if true we are dealing with a tag that has just been opened; if false we are closing a tag.
540      */
541     private void processTag(boolean start) {
542         if (start) {
543             nested++;
544             doc.startElement(tag,attributes);
545         }
546         else {
547             nested--;
548             doc.endElement(tag);
549         }
550     }
551     /** Throws an exception */
552     private void throwException(String   s) throws IOException   {
553         throw new IOException  (s+" near line " + lines + ", column " + columns);
554     }
555     
556     /**
557      * Parses the XML document firing the events to the handler.
558      * @param doc the document handler
559      * @param r the document. The encoding is already resolved. The reader is not closed
560      * @throws IOException on error
561      */
562     public static void parse(SimpleXMLDocHandler doc, SimpleXMLDocHandlerComment comment, Reader   r, boolean html) throws IOException   {
563         SimpleXMLParser parser = new SimpleXMLParser(doc, comment, html);
564         parser.go(r);
565     }
566     
567     /**
568      * Parses the XML document firing the events to the handler.
569      * @param doc the document handler
570      * @param in the document. The encoding is deduced from the stream. The stream is not closed
571      * @throws IOException on error
572      */    
573     public static void parse(SimpleXMLDocHandler doc, InputStream   in) throws IOException   {
574         byte b4[] = new byte[4];
575         int count = in.read(b4);
576         if (count != 4)
577             throw new IOException  ("Insufficient length.");
578         String   encoding = getEncodingName(b4);
579         String   decl = null;
580         if (encoding.equals("UTF-8")) {
581             StringBuffer   sb = new StringBuffer  ();
582             int c;
583             while ((c = in.read()) != -1) {
584                 if (c == '>')
585                     break;
586                 sb.append((char)c);
587             }
588             decl = sb.toString();
589         }
590         else if (encoding.equals("CP037")) {
591             ByteArrayOutputStream   bi = new ByteArrayOutputStream  ();
592             int c;
593             while ((c = in.read()) != -1) {
594                 if (c == 0x6e) // that's '>' in ebcdic
595                     break;
596                 bi.write(c);
597             }
598             decl = new String  (bi.toByteArray(), "CP037");
599         }
600         if (decl != null) {
601             decl = getDeclaredEncoding(decl);
602             if (decl != null)
603                 encoding = decl;
604         }
605         parse(doc, new InputStreamReader  (in, IanaEncodings.getJavaEncoding(encoding)));
606     }
607     
608     private static String   getDeclaredEncoding(String   decl) {
609         if (decl == null)
610             return null;
611         int idx = decl.indexOf("encoding");
612         if (idx < 0)
613             return null;
614         int idx1 = decl.indexOf('"', idx);
615         int idx2 = decl.indexOf('\'', idx);
616         if (idx1 == idx2)
617             return null;
618         if ((idx1 < 0 && idx2 > 0) || (idx2 > 0 && idx2 < idx1)) {
619             int idx3 = decl.indexOf('\'', idx2 + 1);
620             if (idx3 < 0)
621                 return null;
622             return decl.substring(idx2 + 1, idx3);
623         }
624         if ((idx2 < 0 && idx1 > 0) || (idx1 > 0 && idx1 < idx2)) {
625             int idx3 = decl.indexOf('"', idx1 + 1);
626             if (idx3 < 0)
627                 return null;
628             return decl.substring(idx1 + 1, idx3);
629         }
630         return null;
631     }
632     
633     public static void parse(SimpleXMLDocHandler doc,Reader   r) throws IOException   {
634         parse(doc, null, r, false);
635     }
636     
637     /**
638      * Escapes a string with the appropriated XML codes.
639      * @param s the string to be escaped
640      * @param onlyASCII codes above 127 will always be escaped with &amp;#nn; if <CODE>true</CODE>
641      * @return the escaped string
642      */    
643     public static String   escapeXML(String   s, boolean onlyASCII) {
644         char cc[] = s.toCharArray();
645         int len = cc.length;
646         StringBuffer   sb = new StringBuffer  ();
647         for (int k = 0; k < len; ++k) {
648             int c = cc[k];
649             switch (c) {
650                 case '<':
651                     sb.append("&lt;");
652                     break;
653                 case '>':
654                     sb.append("&gt;");
655                     break;
656                 case '&':
657                     sb.append("&amp;");
658                     break;
659                 case '"':
660                     sb.append("&quot;");
661                     break;
662                 case '\'':
663                     sb.append("&apos;");
664                     break;
665                 default:
666                     if (onlyASCII && c > 127)
667                         sb.append("&#").append(c).append(';');
668                     else
669                         sb.append((char)c);
670             }
671         }
672         return sb.toString();
673     }
674     /**
675      * Returns the IANA encoding name that is auto-detected from
676      * the bytes specified, with the endian-ness of that encoding where appropriate.
677      * (method found in org.apache.xerces.impl.XMLEntityManager, originaly published
678      * by the Apache Software Foundation under the Apache Software License; now being
679      * used in iText under the MPL)
680      * @param b4    The first four bytes of the input.
681      * @return an IANA-encoding string
682      */
683     private static String   getEncodingName(byte[] b4) {
684         
685         // UTF-16, with BOM
686         int b0 = b4[0] & 0xFF;
687         int b1 = b4[1] & 0xFF;
688         if (b0 == 0xFE && b1 == 0xFF) {
689             // UTF-16, big-endian
690             return "UTF-16BE";
691         }
692         if (b0 == 0xFF && b1 == 0xFE) {
693             // UTF-16, little-endian
694             return "UTF-16LE";
695         }
696         
697         // UTF-8 with a BOM
698         int b2 = b4[2] & 0xFF;
699         if (b0 == 0xEF && b1 == 0xBB && b2 == 0xBF) {
700             return "UTF-8";
701         }
702         
703         // other encodings
704         int b3 = b4[3] & 0xFF;
705         if (b0 == 0x00 && b1 == 0x00 && b2 == 0x00 && b3 == 0x3C) {
706             // UCS-4, big endian (1234)
707             return "ISO-10646-UCS-4";
708         }
709         if (b0 == 0x3C && b1 == 0x00 && b2 == 0x00 && b3 == 0x00) {
710             // UCS-4, little endian (4321)
711             return "ISO-10646-UCS-4";
712         }
713         if (b0 == 0x00 && b1 == 0x00 && b2 == 0x3C && b3 == 0x00) {
714             // UCS-4, unusual octet order (2143)
715             // REVISIT: What should this be?
716             return "ISO-10646-UCS-4";
717         }
718         if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x00) {
719             // UCS-4, unusual octect order (3412)
720             // REVISIT: What should this be?
721             return "ISO-10646-UCS-4";
722         }
723         if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x3F) {
724             // UTF-16, big-endian, no BOM
725             // (or could turn out to be UCS-2...
726             // REVISIT: What should this be?
727             return "UTF-16BE";
728         }
729         if (b0 == 0x3C && b1 == 0x00 && b2 == 0x3F && b3 == 0x00) {
730             // UTF-16, little-endian, no BOM
731             // (or could turn out to be UCS-2...
732             return "UTF-16LE";
733         }
734         if (b0 == 0x4C && b1 == 0x6F && b2 == 0xA7 && b3 == 0x94) {
735             // EBCDIC
736             // a la xerces1, return CP037 instead of EBCDIC here
737             return "CP037";
738         }
739         
740         // default encoding
741         return "UTF-8";
742     }
743 }
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags