KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > com > lowagie > text > xml > simpleparser > SimpleXMLParser


1 /*
2  * Copyright 2003 Paulo Soares
3  *
4  * The contents of this file are subject to the Mozilla Public License Version 1.1
5  * (the "License"); you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at http://www.mozilla.org/MPL/
7  *
8  * Software distributed under the License is distributed on an "AS IS" basis,
9  * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
10  * for the specific language governing rights and limitations under the License.
11  *
12  * The Original Code is 'iText, a free JAVA-PDF library'.
13  *
14  * The Initial Developer of the Original Code is Bruno Lowagie. Portions created by
15  * the Initial Developer are Copyright (C) 1999, 2000, 2001, 2002 by Bruno Lowagie.
16  * All Rights Reserved.
17  * Co-Developer of the code is Paulo Soares. Portions created by the Co-Developer
18  * are Copyright (C) 2000, 2001, 2002 by Paulo Soares. All Rights Reserved.
19  *
20  * Contributor(s): all the names of the contributors are added in the source code
21  * where applicable.
22  *
23  * Alternatively, the contents of this file may be used under the terms of the
24  * LGPL license (the "GNU LIBRARY GENERAL PUBLIC LICENSE"), in which case the
25  * provisions of LGPL are applicable instead of those above. If you wish to
26  * allow use of your version of this file only under the terms of the LGPL
27  * License and not to allow others to use your version of this file under
28  * the MPL, indicate your decision by deleting the provisions above and
29  * replace them with the notice and other provisions required by the LGPL.
30  * If you do not delete the provisions above, a recipient may use your version
31  * of this file under either the MPL or the GNU LIBRARY GENERAL PUBLIC LICENSE.
32  *
33  * This library is free software; you can redistribute it and/or modify it
34  * under the terms of the MPL as stated above or under the terms of the GNU
35  * Library General Public License as published by the Free Software Foundation;
36  * either version 2 of the License, or any later version.
37  *
38  * This library is distributed in the hope that it will be useful, but WITHOUT
39  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
40  * FOR A PARTICULAR PURPOSE. See the GNU Library general Public License for more
41  * details.
42  *
43  * If you didn't download this code from the following link, you should check if
44  * you aren't using an obsolete version:
45  * http://www.lowagie.com/iText/
46  *
47  * The code to recognize the encoding in this class and in the convenience class IanaEncodings was taken from Apache Xerces published under the following license:
48  * Licensed to the Apache Software Foundation (ASF) under one or more
49  * contributor license agreements. See the NOTICE file distributed with
50  * this work for additional information regarding copyright ownership.
51  * The ASF licenses this file to You under the Apache License, Version 2.0
52  * (the "License"); you may not use this file except in compliance with
53  * the License. You may obtain a copy of the License at
54  *
55  * http://www.apache.org/licenses/LICENSE-2.0
56  *
57  * Unless required by applicable law or agreed to in writing, software
58  * distributed under the License is distributed on an "AS IS" BASIS,
59  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
60  * See the License for the specific language governing permissions and
61  * limitations under the License.
62  *
63  * Part of this code is based on the Quick-and-Dirty XML parser by Steven Brandt.
64  * The code for the Quick-and-Dirty parser was published in JavaWorld (java tip 128).
65  * Steven Brandt and JavaWorld gave permission to use the code for free.
66  * (Bruno Lowagie and Paulo Soares chose to use it under the MPL/LGPL in
67  * conformance with the rest of the code).
68  * The original code can be found on this url: <A HREF="http://www.javaworld.com/javatips/jw-javatip128_p.html">http://www.javaworld.com/javatips/jw-javatip128_p.html</A>.
69  * It was substantially refactored by Bruno Lowagie.
70  *
71  * The method 'private static String getEncodingName(byte[] b4)' was found
72  * in org.apache.xerces.impl.XMLEntityManager, originally published by the
73  * Apache Software Foundation under the Apache Software License; now being
74  * used in iText under the MPL.
75  */

package com.lowagie.text.xml.simpleparser;

import java.io.BufferedReader;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PushbackInputStream;
import java.io.Reader;
import java.util.HashMap;
import java.util.Locale;
import java.util.Stack;

/**
 * A simple XML and HTML parser. Like a SAX parser it is event based,
 * but it offers much less functionality.
 * <p>
 * The parser:
 * <ul>
 * <li>recognizes the encoding used</li>
 * <li>recognizes all the elements' start tags and end tags</li>
 * <li>lists attributes, where attribute values can be enclosed in single or double quotes</li>
 * <li>recognizes the <code>&lt;![CDATA[ ... ]]&gt;</code> construct</li>
 * <li>recognizes the standard entities: &amp;amp;, &amp;lt;, &amp;gt;, &amp;quot;, and &amp;apos;, as well as numeric entities</li>
 * <li>maps lines ending in <code>\r\n</code> and <code>\r</code> to <code>\n</code> on input, in accordance with the XML Specification, Section 2.11</li>
 * </ul>
 */

103 public class SimpleXMLParser {
104     /** possible states */
105     private final static int UNKNOWN = 0;
106     private final static int TEXT = 1;
107     private final static int TAG_ENCOUNTERED = 2;
108     private final static int EXAMIN_TAG = 3;
109     private final static int TAG_EXAMINED = 4;
110     private final static int IN_CLOSETAG = 5;
111     private final static int SINGLE_TAG = 6;
112     private final static int CDATA = 7;
113     private final static int COMMENT = 8;
114     private final static int PI = 9;
115     private final static int ENTITY = 10;
116     private final static int QUOTE = 11;
117     private final static int ATTRIBUTE_KEY = 12;
118     private final static int ATTRIBUTE_EQUAL = 13;
119     private final static int ATTRIBUTE_VALUE = 14;
120     
121     /** the state stack */
122     protected Stack JavaDoc stack;
123     /** The current character. */
124     protected int character = 0;
125     /** The previous character. */
126     protected int previousCharacter = -1;
127     /** the line we are currently reading */
128     protected int lines = 1;
129     /** the column where the current character occurs */
130     protected int columns = 0;
131     /** was the last character equivalent to a newline? */
132     protected boolean eol = false;
133     /** the current state */
134     protected int state;
135     /** Are we parsing HTML? */
136     protected boolean html;
137     /** current text (whatever is encountered between tags) */
138     protected StringBuffer JavaDoc text = new StringBuffer JavaDoc();
139     /** current entity (whatever is encountered between & and ;) */
140     protected StringBuffer JavaDoc entity = new StringBuffer JavaDoc();
141     /** current tagname */
142     protected String JavaDoc tag = null;
143     /** current attributes */
144     protected HashMap JavaDoc attributes = null;
145     /** The handler to which we are going to forward document content */
146     protected SimpleXMLDocHandler doc;
147     /** The handler to which we are going to forward comments. */
148     protected SimpleXMLDocHandlerComment comment;
149     /** Keeps track of the number of tags that are open. */
150     int nested = 0;
151     /** the quote character that was used to open the quote. */
152     protected int quoteCharacter = '"';
153     /** the attribute key. */
154     String JavaDoc attributekey = null;
155     /** the attribute value. */
156     String JavaDoc attributevalue = null;
157     
/**
 * Creates a Simple XML parser object.
 * Call {@link #go(Reader)} immediately after creation.
 *
 * @param doc     the handler that receives document, element and text events
 * @param comment the handler that receives comments; may be <code>null</code>,
 *                in which case comments are discarded
 * @param html    <code>true</code> to parse HTML (parsing starts in the text state),
 *                <code>false</code> to parse XML (parsing starts in the unknown state)
 */
private SimpleXMLParser(SimpleXMLDocHandler doc, SimpleXMLDocHandlerComment comment, boolean html) {
    this.doc = doc;
    this.comment = comment;
    this.html = html;
    stack = new Stack();
    state = html ? TEXT : UNKNOWN;
}
169     
170     /**
171      * Does the actual parsing. Perform this immediately
172      * after creating the parser object.
173      */

174     private void go(Reader JavaDoc r) throws IOException JavaDoc {
175         BufferedReader JavaDoc reader;
176         if (r instanceof BufferedReader JavaDoc)
177             reader = (BufferedReader JavaDoc)r;
178         else
179             reader = new BufferedReader JavaDoc(r);
180         doc.startDocument();
181         while(true) {
182             // read a new character
183
if (previousCharacter == -1) {
184                 character = reader.read();
185             }
186             // or re-examin the previous character
187
else {
188                 character = previousCharacter;
189                 previousCharacter = -1;
190             }
191             
192             // the end of the file was reached
193
if (character == -1) {
194                 if (html) {
195                     if (html && state == TEXT)
196                         flush();
197                     doc.endDocument();
198                 } else {
199                     throwException("Missing end tag");
200                 }
201                 return;
202             }
203             
204             // dealing with \n and \r
205
if (character == '\n' && eol) {
206                 eol = false;
207                 continue;
208             } else if (eol) {
209                 eol = false;
210             } else if (character == '\n') {
211                 lines++;
212                 columns = 0;
213             } else if (character == '\r') {
214                 eol = true;
215                 character = '\n';
216                 lines++;
217                 columns = 0;
218             } else {
219                 columns++;
220             }
221             
222             switch(state) {
223             // we are in an unknown state before there's actual content
224
case UNKNOWN:
225                 if(character == '<') {
226                     saveState(TEXT);
227                     state = TAG_ENCOUNTERED;
228                 }
229                 break;
230             // we can encounter any content
231
case TEXT:
232                 if(character == '<') {
233                     flush();
234                     saveState(state);
235                     state = TAG_ENCOUNTERED;
236                 } else if(character == '&') {
237                     saveState(state);
238                     entity.setLength(0);
239                     state = ENTITY;
240                 } else
241                     text.append((char)character);
242                 break;
243             // we have just seen a < and are wondering what we are looking at
244
// <foo>, </foo>, <!-- ... --->, etc.
245
case TAG_ENCOUNTERED:
246                 initTag();
247                 if(character == '/') {
248                     state = IN_CLOSETAG;
249                 } else if (character == '?') {
250                     restoreState();
251                     state = PI;
252                 } else {
253                     text.append((char)character);
254                     state = EXAMIN_TAG;
255                 }
256                 break;
257             // we are processing something like this <foo ... >.
258
// It could still be a <!-- ... --> or something.
259
case EXAMIN_TAG:
260                 if(character == '>') {
261                     doTag();
262                     processTag(true);
263                     initTag();
264                     state = restoreState();
265                 } else if(character == '/') {
266                     state = SINGLE_TAG;
267                 } else if(character == '-' && text.toString().equals("!-")) {
268                     flush();
269                     state = COMMENT;
270                 } else if(character == '[' && text.toString().equals("![CDATA")) {
271                     flush();
272                     state = CDATA;
273                 } else if(character == 'E' && text.toString().equals("!DOCTYP")) {
274                     flush();
275                     state = PI;
276                 } else if(Character.isWhitespace((char)character)) {
277                     doTag();
278                     state = TAG_EXAMINED;
279                 } else {
280                     text.append((char)character);
281                 }
282                 break;
283             // we know the name of the tag now.
284
case TAG_EXAMINED:
285                 if(character == '>') {
286                     processTag(true);
287                     initTag();
288                     state = restoreState();
289                 } else if(character == '/') {
290                     state = SINGLE_TAG;
291                 } else if(Character.isWhitespace((char)character)) {
292                     // empty
293
} else {
294                     text.append((char)character);
295                     state = ATTRIBUTE_KEY;
296                 }
297                 break;
298                 
299                 // we are processing a closing tag: e.g. </foo>
300
case IN_CLOSETAG:
301                 if(character == '>') {
302                     doTag();
303                     processTag(false);
304                     if(!html && nested==0) return;
305                     state = restoreState();
306                 } else {
307                     if (!Character.isWhitespace((char)character))
308                         text.append((char)character);
309                 }
310                 break;
311                 
312             // we have just seen something like this: <foo a="b"/
313
// and are looking for the final >.
314
case SINGLE_TAG:
315                 if(character != '>')
316                     throwException("Expected > for tag: <"+tag+"/>");
317                 doTag();
318                 processTag(true);
319                 processTag(false);
320                 initTag();
321                 if(!html && nested==0) {
322                     doc.endDocument();
323                     return;
324                 }
325                 state = restoreState();
326                 break;
327                 
328             // we are processing CDATA
329
case CDATA:
330                 if(character == '>'
331                 && text.toString().endsWith("]]")) {
332                     text.setLength(text.length()-2);
333                     flush();
334                     state = restoreState();
335                 } else
336                     text.append((char)character);
337                 break;
338                 
339             // we are processing a comment. We are inside
340
// the <!-- .... --> looking for the -->.
341
case COMMENT:
342                 if(character == '>'
343                 && text.toString().endsWith("--")) {
344                     text.setLength(text.length() - 2);
345                     flush();
346                     state = restoreState();
347                 } else
348                     text.append((char)character);
349                 break;
350                 
351             // We are inside one of these <? ... ?> or one of these <!DOCTYPE ... >
352
case PI:
353                 if(character == '>') {
354                     state = restoreState();
355                     if(state == TEXT) state = UNKNOWN;
356                 }
357                 break;
358                 
359             // we are processing an entity, e.g. &lt;, &#187;, etc.
360
case ENTITY:
361                 if(character == ';') {
362                     state = restoreState();
363                     String JavaDoc cent = entity.toString();
364                     entity.setLength(0);
365                     char ce = EntitiesToUnicode.decodeEntity(cent);
366                     if (ce == '\0')
367                         text.append('&').append(cent).append(';');
368                     else
369                         text.append(ce);
370                 } else if ((character != '#' && (character < '0' || character > '9') && (character < 'a' || character > 'z')
371                     && (character < 'A' || character > 'Z')) || entity.length() >= 7) {
372                     state = restoreState();
373                     previousCharacter = character;
374                     text.append('&').append(entity.toString());
375                     entity.setLength(0);
376                 }
377                 else {
378                     entity.append((char)character);
379                 }
380                 break;
381             // We are processing the quoted right-hand side of an element's attribute.
382
case QUOTE:
383                 if (html && quoteCharacter == ' ' && character == '>') {
384                     flush();
385                     processTag(true);
386                     initTag();
387                     state = restoreState();
388                 }
389                 else if (html && quoteCharacter == ' ' && Character.isWhitespace((char)character)) {
390                     flush();
391                     state = TAG_EXAMINED;
392                 }
393                 else if (html && quoteCharacter == ' ') {
394                     text.append((char)character);
395                 }
396                 else if(character == quoteCharacter) {
397                     flush();
398                     state = TAG_EXAMINED;
399                 } else if(" \r\n\u0009".indexOf(character)>=0) {
400                     text.append(' ');
401                 } else if(character == '&') {
402                     saveState(state);
403                     state = ENTITY;
404                     entity.setLength(0);
405                 } else {
406                     text.append((char)character);
407                 }
408                 break;
409                 
410             case ATTRIBUTE_KEY:
411                 if(Character.isWhitespace((char)character)) {
412                     flush();
413                     state = ATTRIBUTE_EQUAL;
414                 } else if(character == '=') {
415                     flush();
416                     state = ATTRIBUTE_VALUE;
417                 } else if (html && character == '>') {
418                     text.setLength(0);
419                     processTag(true);
420                     initTag();
421                     state = restoreState();
422                 } else {
423                     text.append((char)character);
424                 }
425                 break;
426                 
427             case ATTRIBUTE_EQUAL:
428                 if(character == '=') {
429                     state = ATTRIBUTE_VALUE;
430                 } else if(Character.isWhitespace((char)character)) {
431                     // empty
432
} else if (html && character == '>') {
433                     text.setLength(0);
434                     processTag(true);
435                     initTag();
436                     state = restoreState();
437                 } else if (html && character == '/') {
438                     flush();
439                     state = SINGLE_TAG;
440                 } else if (html) {
441                     flush();
442                     text.append((char)character);
443                     state = ATTRIBUTE_KEY;
444                 } else {
445                     throwException("Error in attribute processing.");
446                 }
447                 break;
448                 
449             case ATTRIBUTE_VALUE:
450                 if(character == '"' || character == '\'') {
451                     quoteCharacter = character;
452                     state = QUOTE;
453                 } else if(Character.isWhitespace((char)character)) {
454                     // empty
455
} else if (html && character == '>') {
456                     flush();
457                     processTag(true);
458                     initTag();
459                     state = restoreState();
460                 } else if (html) {
461                     text.append((char)character);
462                     quoteCharacter = ' ';
463                     state = QUOTE;
464                 } else {
465                     throwException("Error in attribute processing");
466                 }
467                 break;
468             }
469         }
470     }
471
472     /**
473      * Gets a state from the stack
474      * @return the previous state
475      */

476     private int restoreState() {
477         if(!stack.empty())
478             return ((Integer JavaDoc)stack.pop()).intValue();
479         else
480             return UNKNOWN;
481     }
482     /**
483      * Adds a state to the stack.
484      * @param s a state to add to the stack
485      */

486     private void saveState(int s) {
487         stack.push(new Integer JavaDoc(s));
488     }
489     /**
490      * Flushes the text that is currently in the buffer.
491      * The text can be ignored, added to the document
492      * as content or as comment,... depending on the current state.
493      */

494     private void flush() {
495         switch(state){
496         case TEXT:
497         case CDATA:
498             if(text.length() > 0) {
499                 doc.text(text.toString());
500             }
501             break;
502         case COMMENT:
503             if (comment != null) {
504                 comment.comment(text.toString());
505             }
506             break;
507         case ATTRIBUTE_KEY:
508             attributekey = text.toString();
509             if (html)
510                 attributekey = attributekey.toLowerCase();
511             break;
512         case QUOTE:
513         case ATTRIBUTE_VALUE:
514             attributevalue = text.toString();
515             attributes.put(attributekey,attributevalue);
516             break;
517         default:
518             // do nothing
519
}
520         text.setLength(0);
521     }
522     /**
523      * Initialized the tag name and attributes.
524      */

525     private void initTag() {
526         tag = null;
527         attributes = new HashMap JavaDoc();
528     }
529     /** Sets the name of the tag. */
530     private void doTag() {
531         if(tag == null)
532             tag = text.toString();
533         if (html)
534             tag = tag.toLowerCase();
535         text.setLength(0);
536     }
537     /**
538      * processes the tag.
539      * @param start if true we are dealing with a tag that has just been opened; if false we are closing a tag.
540      */

541     private void processTag(boolean start) {
542         if (start) {
543             nested++;
544             doc.startElement(tag,attributes);
545         }
546         else {
547             nested--;
548             doc.endElement(tag);
549         }
550     }
551     /** Throws an exception */
552     private void throwException(String JavaDoc s) throws IOException JavaDoc {
553         throw new IOException JavaDoc(s+" near line " + lines + ", column " + columns);
554     }
555     
556     /**
557      * Parses the XML document firing the events to the handler.
558      * @param doc the document handler
559      * @param r the document. The encoding is already resolved. The reader is not closed
560      * @throws IOException on error
561      */

562     public static void parse(SimpleXMLDocHandler doc, SimpleXMLDocHandlerComment comment, Reader JavaDoc r, boolean html) throws IOException JavaDoc {
563         SimpleXMLParser parser = new SimpleXMLParser(doc, comment, html);
564         parser.go(r);
565     }
566     
567     /**
568      * Parses the XML document firing the events to the handler.
569      * @param doc the document handler
570      * @param in the document. The encoding is deduced from the stream. The stream is not closed
571      * @throws IOException on error
572      */

573     public static void parse(SimpleXMLDocHandler doc, InputStream JavaDoc in) throws IOException JavaDoc {
574         byte b4[] = new byte[4];
575         int count = in.read(b4);
576         if (count != 4)
577             throw new IOException JavaDoc("Insufficient length.");
578         String JavaDoc encoding = getEncodingName(b4);
579         String JavaDoc decl = null;
580         if (encoding.equals("UTF-8")) {
581             StringBuffer JavaDoc sb = new StringBuffer JavaDoc();
582             int c;
583             while ((c = in.read()) != -1) {
584                 if (c == '>')
585                     break;
586                 sb.append((char)c);
587             }
588             decl = sb.toString();
589         }
590         else if (encoding.equals("CP037")) {
591             ByteArrayOutputStream JavaDoc bi = new ByteArrayOutputStream JavaDoc();
592             int c;
593             while ((c = in.read()) != -1) {
594                 if (c == 0x6e) // that's '>' in ebcdic
595
break;
596                 bi.write(c);
597             }
598             decl = new String JavaDoc(bi.toByteArray(), "CP037");
599         }
600         if (decl != null) {
601             decl = getDeclaredEncoding(decl);
602             if (decl != null)
603                 encoding = decl;
604         }
605         parse(doc, new InputStreamReader JavaDoc(in, IanaEncodings.getJavaEncoding(encoding)));
606     }
607     
608     private static String JavaDoc getDeclaredEncoding(String JavaDoc decl) {
609         if (decl == null)
610             return null;
611         int idx = decl.indexOf("encoding");
612         if (idx < 0)
613             return null;
614         int idx1 = decl.indexOf('"', idx);
615         int idx2 = decl.indexOf('\'', idx);
616         if (idx1 == idx2)
617             return null;
618         if ((idx1 < 0 && idx2 > 0) || (idx2 > 0 && idx2 < idx1)) {
619             int idx3 = decl.indexOf('\'', idx2 + 1);
620             if (idx3 < 0)
621                 return null;
622             return decl.substring(idx2 + 1, idx3);
623         }
624         if ((idx2 < 0 && idx1 > 0) || (idx1 > 0 && idx1 < idx2)) {
625             int idx3 = decl.indexOf('"', idx1 + 1);
626             if (idx3 < 0)
627                 return null;
628             return decl.substring(idx1 + 1, idx3);
629         }
630         return null;
631     }
632     
633     public static void parse(SimpleXMLDocHandler doc,Reader JavaDoc r) throws IOException JavaDoc {
634         parse(doc, null, r, false);
635     }
636     
637     /**
638      * Escapes a string with the appropriated XML codes.
639      * @param s the string to be escaped
640      * @param onlyASCII codes above 127 will always be escaped with &amp;#nn; if <CODE>true</CODE>
641      * @return the escaped string
642      */

643     public static String JavaDoc escapeXML(String JavaDoc s, boolean onlyASCII) {
644         char cc[] = s.toCharArray();
645         int len = cc.length;
646         StringBuffer JavaDoc sb = new StringBuffer JavaDoc();
647         for (int k = 0; k < len; ++k) {
648             int c = cc[k];
649             switch (c) {
650                 case '<':
651                     sb.append("&lt;");
652                     break;
653                 case '>':
654                     sb.append("&gt;");
655                     break;
656                 case '&':
657                     sb.append("&amp;");
658                     break;
659                 case '"':
660                     sb.append("&quot;");
661                     break;
662                 case '\'':
663                     sb.append("&apos;");
664                     break;
665                 default:
666                     if (onlyASCII && c > 127)
667                         sb.append("&#").append(c).append(';');
668                     else
669                         sb.append((char)c);
670             }
671         }
672         return sb.toString();
673     }
674     /**
675      * Returns the IANA encoding name that is auto-detected from
676      * the bytes specified, with the endian-ness of that encoding where appropriate.
677      * (method found in org.apache.xerces.impl.XMLEntityManager, originaly published
678      * by the Apache Software Foundation under the Apache Software License; now being
679      * used in iText under the MPL)
680      * @param b4 The first four bytes of the input.
681      * @return an IANA-encoding string
682      */

683     private static String JavaDoc getEncodingName(byte[] b4) {
684         
685         // UTF-16, with BOM
686
int b0 = b4[0] & 0xFF;
687         int b1 = b4[1] & 0xFF;
688         if (b0 == 0xFE && b1 == 0xFF) {
689             // UTF-16, big-endian
690
return "UTF-16BE";
691         }
692         if (b0 == 0xFF && b1 == 0xFE) {
693             // UTF-16, little-endian
694
return "UTF-16LE";
695         }
696         
697         // UTF-8 with a BOM
698
int b2 = b4[2] & 0xFF;
699         if (b0 == 0xEF && b1 == 0xBB && b2 == 0xBF) {
700             return "UTF-8";
701         }
702         
703         // other encodings
704
int b3 = b4[3] & 0xFF;
705         if (b0 == 0x00 && b1 == 0x00 && b2 == 0x00 && b3 == 0x3C) {
706             // UCS-4, big endian (1234)
707
return "ISO-10646-UCS-4";
708         }
709         if (b0 == 0x3C && b1 == 0x00 && b2 == 0x00 && b3 == 0x00) {
710             // UCS-4, little endian (4321)
711
return "ISO-10646-UCS-4";
712         }
713         if (b0 == 0x00 && b1 == 0x00 && b2 == 0x3C && b3 == 0x00) {
714             // UCS-4, unusual octet order (2143)
715
// REVISIT: What should this be?
716
return "ISO-10646-UCS-4";
717         }
718         if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x00) {
719             // UCS-4, unusual octect order (3412)
720
// REVISIT: What should this be?
721
return "ISO-10646-UCS-4";
722         }
723         if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x3F) {
724             // UTF-16, big-endian, no BOM
725
// (or could turn out to be UCS-2...
726
// REVISIT: What should this be?
727
return "UTF-16BE";
728         }
729         if (b0 == 0x3C && b1 == 0x00 && b2 == 0x3F && b3 == 0x00) {
730             // UTF-16, little-endian, no BOM
731
// (or could turn out to be UCS-2...
732
return "UTF-16LE";
733         }
734         if (b0 == 0x4C && b1 == 0x6F && b2 == 0xA7 && b3 == 0x94) {
735             // EBCDIC
736
// a la xerces1, return CP037 instead of EBCDIC here
737
return "CP037";
738         }
739         
740         // default encoding
741
return "UTF-8";
742     }
743 }
Popular Tags