SimpleXMLParser


1   /*
2    * Copyright 2003 Paulo Soares
3    *
4    * The contents of this file are subject to the Mozilla Public License Version 1.1
5    * (the "License"); you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at http://www.mozilla.org/MPL/
7    *
8    * Software distributed under the License is distributed on an "AS IS" basis,
9    * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
10   * for the specific language governing rights and limitations under the License.
11   *
12   * The Original Code is 'iText, a free JAVA-PDF library'.
13   *
14   * The Initial Developer of the Original Code is Bruno Lowagie. Portions created by
15   * the Initial Developer are Copyright (C) 1999, 2000, 2001, 2002 by Bruno Lowagie.
16   * All Rights Reserved.
17   * Co-Developer of the code is Paulo Soares. Portions created by the Co-Developer
18   * are Copyright (C) 2000, 2001, 2002 by Paulo Soares. All Rights Reserved.
19   *
20   * Contributor(s): all the names of the contributors are added in the source code
21   * where applicable.
22   *
23   * Alternatively, the contents of this file may be used under the terms of the
24   * LGPL license (the "GNU LIBRARY GENERAL PUBLIC LICENSE"), in which case the
25   * provisions of LGPL are applicable instead of those above.  If you wish to
26   * allow use of your version of this file only under the terms of the LGPL
27   * License and not to allow others to use your version of this file under
28   * the MPL, indicate your decision by deleting the provisions above and
29   * replace them with the notice and other provisions required by the LGPL.
30   * If you do not delete the provisions above, a recipient may use your version
31   * of this file under either the MPL or the GNU LIBRARY GENERAL PUBLIC LICENSE.
32   *
33   * This library is free software; you can redistribute it and/or modify it
34   * under the terms of the MPL as stated above or under the terms of the GNU
35   * Library General Public License as published by the Free Software Foundation;
36   * either version 2 of the License, or any later version.
37   *
38   * This library is distributed in the hope that it will be useful, but WITHOUT
39   * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
40   * FOR A PARTICULAR PURPOSE. See the GNU Library general Public License for more
41   * details.
42   *
43   * If you didn't download this code from the following link, you should check if
44   * you aren't using an obsolete version:
45   * http://www.lowagie.com/iText/
46   */
47  package com.lowagie.text.pdf;
48  
49  import java.io.*;
50  import java.util.Stack;
51  import java.util.HashMap;
52  
53  /**
54   * A simple XML and HTML parser.  This parser is, like the SAX parser,
55   * an event based parser, but with much less functionality.
56   * <p>
57   * The parser can:
58   * <p>
59   * <ul>
60   * <li>It recognizes the encoding used
61   * <li>It recognizes all the elements' start tags and end tags
62   * <li>It lists attributes, where attribute values can be enclosed in single or double quotes
63   * <li>It recognizes the <code>&lt;![CDATA[ ... ]]&gt;</code> construct
64   * <li>It recognizes the standard entities: &amp;amp;, &amp;lt;, &amp;gt;, &amp;quot;, and &amp;apos;, as well as numeric entities
65   * <li>It maps lines ending in <code>\r\n</code> and <code>\r</code> to <code>\n</code> on input, in accordance with the XML Specification, Section 2.11
66   * </ul>
67   * <p>
68   * The code is based on <A HREF="http://www.javaworld.com/javaworld/javatips/javatip128/">
69   * http://www.javaworld.com/javaworld/javatips/javatip128/</A> with some extra
70   * code from XERCES to recognize the encoding.
71   */
72  public class SimpleXMLParser {
73      private static final HashMap fIANA2JavaMap = new HashMap(); // IANA charset name -> Java encoding name; filled by the static initializer, read by getJavaEncoding
74      private static final HashMap entityMap = new HashMap(); // entity name -> Character; read by decodeEntity
75      
76      private static int popMode(Stack st) {
77          if(!st.empty())
78              return ((Integer)st.pop()).intValue();
79          else
80              return PRE;
81      }
82      
83      private final static int
84      TEXT = 1,
85      ENTITY = 2,
86      OPEN_TAG = 3,
87      CLOSE_TAG = 4,
88      START_TAG = 5,
89      ATTRIBUTE_LVALUE = 6,
90      ATTRIBUTE_EQUAL = 9,
91      ATTRIBUTE_RVALUE = 10,
92      QUOTE = 7,
93      IN_TAG = 8,
94      SINGLE_TAG = 12,
95      COMMENT = 13,
96      DONE = 11,
97      DOCTYPE = 14,
98      PRE = 15,
99      CDATA = 16;
100     
101     private SimpleXMLParser() {
            // Private so the class cannot be instantiated from outside;
            // every member visible in this part of the file is static.
102     }
103     
104     /**
105      * Parses the XML document firing the events to the handler.
106      * @param doc the document handler
107      * @param in the document. The encoding is deduced from the stream. The stream is not closed
108      * @throws IOException on error
109      */    
110     public static void parse(SimpleXMLDocHandler doc, InputStream in) throws IOException {
111         byte b4[] = new byte[4];
112         int count = in.read(b4);
113         if (count != 4)
114             throw new IOException("Insufficient length.");
115         String encoding = getEncodingName(b4);
116         String decl = null;
117         if (encoding.equals("UTF-8")) {
118             StringBuffer sb = new StringBuffer();
119             int c;
120             while ((c = in.read()) != -1) {
121                 if (c == '>')
122                     break;
123                 sb.append((char)c);
124             }
125             decl = sb.toString();
126         }
127         else if (encoding.equals("CP037")) {
128             ByteArrayOutputStream bi = new ByteArrayOutputStream();
129             int c;
130             while ((c = in.read()) != -1) {
131                 if (c == 0x6e) // that's '>' in ebcdic
132                     break;
133                 bi.write(c);
134             }
135             decl = new String(bi.toByteArray(), "CP037");
136         }
137         if (decl != null) {
138             decl = getDeclaredEncoding(decl);
139             if (decl != null)
140                 encoding = decl;
141         }
142         parse(doc, new InputStreamReader(in, getJavaEncoding(encoding)));
143     }
144     
145     private static String getDeclaredEncoding(String decl) {
146         if (decl == null)
147             return null;
148         int idx = decl.indexOf("encoding");
149         if (idx < 0)
150             return null;
151         int idx1 = decl.indexOf('"', idx);
152         int idx2 = decl.indexOf('\'', idx);
153         if (idx1 == idx2)
154             return null;
155         if ((idx1 < 0 && idx2 > 0) || (idx2 > 0 && idx2 < idx1)) {
156             int idx3 = decl.indexOf('\'', idx2 + 1);
157             if (idx3 < 0)
158                 return null;
159             return decl.substring(idx2 + 1, idx3);
160         }
161         if ((idx2 < 0 && idx1 > 0) || (idx1 > 0 && idx1 < idx2)) {
162             int idx3 = decl.indexOf('"', idx1 + 1);
163             if (idx3 < 0)
164                 return null;
165             return decl.substring(idx1 + 1, idx3);
166         }
167         return null;
168     }
169     
170     /**
171      * Gets the java encoding from the IANA encoding. If the encoding cannot be found
172      * it returns the input.
173      * @param iana the IANA encoding
174      * @return the java encoding
175      */    
176     public static String getJavaEncoding(String iana) {
177         String IANA = iana.toUpperCase();
178         String jdec = (String)fIANA2JavaMap.get(IANA);
179         if (jdec == null)
180             jdec = iana;
181         return jdec;
182     }
183     
184     public static void parse(SimpleXMLDocHandler doc,Reader r) throws IOException {
185         parse(doc, null, r, false);
186     }
187     
188     /**
189      * Parses the XML document firing the events to the handler.
190      * @param doc the document handler
191      * @param r the document. The encoding is already resolved. The reader is not closed
192      * @throws IOException on error
193      */
194     public static void parse(SimpleXMLDocHandler doc, SimpleXMLDocHandlerComment comment, Reader r, boolean html) throws IOException {
195         BufferedReader reader;
196         if (r instanceof BufferedReader)
197             reader = (BufferedReader)r;
198         else
199             reader = new BufferedReader(r);
200         Stack st = new Stack();
201         int depth = 0;
202         int mode = PRE;
203         int c = 0;
204         int quotec = '"';
205         depth = 0;
206         StringBuffer sb = new StringBuffer();
207         StringBuffer etag = new StringBuffer();
208         String tagName = null;
209         String lvalue = null;
210         String rvalue = null;
211         HashMap attrs = null;
212         st = new Stack();
213         doc.startDocument();
214         int line=1, col=0;
215         boolean eol = false;
216         if (html)
217             mode = TEXT;
218         int pushBack = -1;
219         while(true) {
220             if (pushBack != -1) {
221                 c = pushBack;
222                 pushBack = -1;
223             }
224             else
225                 c = reader.read();
226             if (c == -1)
227                 break;
228             
229             // We need to map \r, \r\n, and \n to \n
230             // See XML spec section 2.11
231             if(c == '\n' && eol) {
232                 eol = false;
233                 continue;
234             } else if(eol) {
235                 eol = false;
236             } else if(c == '\n') {
237                 line++;
238                 col=0;
239             } else if(c == '\r') {
240                 eol = true;
241                 c = '\n';
242                 line++;
243                 col=0;
244             } else {
245                 col++;
246             }
247             
248             if(mode == DONE) {
249                 doc.endDocument();
250                 return;
251                 
252                 // We are between tags collecting text.
253             } else if(mode == TEXT) {
254                 if(c == '<') {
255                     st.push(new Integer(mode));
256                     mode = START_TAG;
257                     if(sb.length() > 0) {
258                         doc.text(sb.toString());
259                         sb.setLength(0);
260                     }
261                 } else if(c == '&') {
262                     st.push(new Integer(mode));
263                     mode = ENTITY;
264                     etag.setLength(0);
265                 } else
266                     sb.append((char)c);
267                 
268                 // we are processing a closing tag: e.g. </foo>
269             } else if(mode == CLOSE_TAG) {
270                 if(c == '>') {
271                     mode = popMode(st);
272                     tagName = sb.toString();
273                     if (html)
274                         tagName = tagName.toLowerCase();
275                     sb.setLength(0);
276                     depth--;
277                     if(!html && depth==0)
278                         mode = DONE;
279                    doc.endElement(tagName);
280                 } else {
281                     if (!Character.isWhitespace((char)c))
282                         sb.append((char)c);
283                 }
284                 
285                 // we are processing CDATA
286             } else if(mode == CDATA) {
287                 if(c == '>'
288                 && sb.toString().endsWith("]]")) {
289                     sb.setLength(sb.length()-2);
290                     doc.text(sb.toString());
291                     sb.setLength(0);
292                     mode = popMode(st);
293                 } else
294                     sb.append((char)c);
295                 
296                 // we are processing a comment.  We are inside
297                 // the <!-- .... --> looking for the -->.
298             } else if(mode == COMMENT) {
299                 if(c == '>'
300                 && sb.toString().endsWith("--")) {
301                     if (comment != null) {
302                         sb.setLength(sb.length() - 2);
303                         comment.comment(sb.toString());
304                     }
305                     sb.setLength(0);
306                     mode = popMode(st);
307                 } else
308                     sb.append((char)c);
309                 
310                 // We are outside the root tag element
311             } else if(mode == PRE) {
312                 if(c == '<') {
313                     mode = TEXT;
314                     st.push(new Integer(mode));
315                     mode = START_TAG;
316                 }
317                 
318                 // We are inside one of these <? ... ?>
319                 // or one of these <!DOCTYPE ... >
320             } else if(mode == DOCTYPE) {
321                 if(c == '>') {
322                     mode = popMode(st);
323                     if(mode == TEXT) mode = PRE;
324                 }
325                 
326                 // we have just seen a < and
327                 // are wondering what we are looking at
328                 // <foo>, </foo>, <!-- ... --->, etc.
329             } else if(mode == START_TAG) {
330                 mode = popMode(st);
331                 if(c == '/') {
332                     st.push(new Integer(mode));
333                     mode = CLOSE_TAG;
334                 } else if (c == '?') {
335                     mode = DOCTYPE;
336                 } else {
337                     st.push(new Integer(mode));
338                     mode = OPEN_TAG;
339                     tagName = null;
340                     attrs = new HashMap();
341                     sb.append((char)c);
342                 }
343                 
344                 // we are processing an entity, e.g. &lt;, &#187;, etc.
345             } else if(mode == ENTITY) {
346                 if(c == ';') {
347                     mode = popMode(st);
348                     String cent = etag.toString();
349                     etag.setLength(0);
350                     if(cent.startsWith("#x")) {
351                         try {
352                             char ci = (char)Integer.parseInt(cent.substring(2),16);
353                             sb.append(ci);
354                         }
355                         catch (Exception es) {
356                             sb.append('&').append(cent).append(';');
357                         }
358                     }
359                     else if(cent.startsWith("#")) {
360                         try {
361                             char ci = (char)Integer.parseInt(cent.substring(1));
362                             sb.append(ci);
363                         }
364                         catch (Exception es) {
365                             sb.append('&').append(cent).append(';');
366                         }
367                     }
368                     else {
369                         char ce = decodeEntity(cent);
370                         if (ce == '\0')
371                             sb.append('&').append(cent).append(';');
372                         else
373                         sb.append(ce);
374                     }
375                 } else if ((c != '#' && (c < '0' || c > '9') && (c < 'a' || c > 'z')
376                     && (c < 'A' || c > 'Z')) || etag.length() >= 7) {
377                     mode = popMode(st);
378                     pushBack = c;
379                     sb.append('&').append(etag.toString());
380                     etag.setLength(0);
381                 }
382                 else {
383                     etag.append((char)c);
384                 }
385                 
386                 // we have just seen something like this:
387                 // <foo a="b"/
388                 // and are looking for the final >.
389             } else if(mode == SINGLE_TAG) {
390                 if(tagName == null)
391                     tagName = sb.toString();
392                 if (html)
393                     tagName = tagName.toLowerCase();
394                 if(c != '>')
395                     exc("Expected > for tag: <"+tagName+"/>",line,col);
396                 doc.startElement(tagName,attrs);
397                 doc.endElement(tagName);
398                 if(!html && depth==0) {
399                     doc.endDocument();
400                     return;
401                 }
402                 sb.setLength(0);
403                 attrs = new HashMap();
404                 tagName = null;
405                 mode = popMode(st);
406                 
407                 // we are processing something
408                 // like this <foo ... >.  It could
409                 // still be a <!-- ... --> or something.
410             } else if(mode == OPEN_TAG) {
411                 if(c == '>') {
412                     if(tagName == null)
413                         tagName = sb.toString();
414                     if (html)
415                         tagName = tagName.toLowerCase();
416                     sb.setLength(0);
417                     depth++;
418                     doc.startElement(tagName,attrs);
419                     tagName = null;
420                     attrs = new HashMap();
421                     mode = popMode(st);
422                 } else if(c == '/') {
423                     mode = SINGLE_TAG;
424                 } else if(c == '-' && sb.toString().equals("!-")) {
425                     mode = COMMENT;
426                     sb.setLength(0);
427                 } else if(c == '[' && sb.toString().equals("![CDATA")) {
428                     mode = CDATA;
429                     sb.setLength(0);
430                 } else if(c == 'E' && sb.toString().equals("!DOCTYP")) {
431                     sb.setLength(0);
432                     mode = DOCTYPE;
433                 } else if(Character.isWhitespace((char)c)) {
434                     tagName = sb.toString();
435                     if (html)
436                         tagName = tagName.toLowerCase();
437                     sb.setLength(0);
438                     mode = IN_TAG;
439                 } else {
440                     sb.append((char)c);
441                 }
442                 
443                 // We are processing the quoted right-hand side
444                 // of an element's attribute.
445             } else if(mode == QUOTE) {
446                 if (html && quotec == ' ' && c == '>') {
447                     rvalue = sb.toString();
448                     sb.setLength(0);
449                     attrs.put(lvalue,rvalue);
450                     mode = popMode(st);
451                     doc.startElement(tagName,attrs);
452                     depth++;
453                     tagName = null;
454                     attrs = new HashMap();
455                 }
456                 else if (html && quotec == ' ' && Character.isWhitespace((char)c)) {
457                     rvalue = sb.toString();
458                     sb.setLength(0);
459                     attrs.put(lvalue,rvalue);
460                     mode = IN_TAG;
461                 }
462                 else if (html && quotec == ' ') {
463                     sb.append((char)c);
464                 }
465                 else if(c == quotec) {
466                     rvalue = sb.toString();
467                     sb.setLength(0);
468                     attrs.put(lvalue,rvalue);
469                     mode = IN_TAG;
470                     // See section the XML spec, section 3.3.3
471                     // on normalization processing.
472                 } else if(" \r\n\u0009".indexOf(c)>=0) {
473                     sb.append(' ');
474                 } else if(c == '&') {
475                     st.push(new Integer(mode));
476                     mode = ENTITY;
477                     etag.setLength(0);
478                 } else {
479                     sb.append((char)c);
480                 }
481                 
482             } else if(mode == ATTRIBUTE_RVALUE) {
483                 if(c == '"' || c == '\'') {
484                     quotec = c;
485                     mode = QUOTE;
486                 } else if(Character.isWhitespace((char)c)) {
487                     ;
488                 } else if (html && c == '>') {
489                     attrs.put(lvalue,sb.toString());
490                     sb.setLength(0);
491                     mode = popMode(st);
492                     doc.startElement(tagName,attrs);
493                     depth++;
494                     tagName = null;
495                     attrs = new HashMap();
496                 } else if (html) {
497                     sb.append((char)c);
498                     quotec = ' ';
499                     mode = QUOTE;
500                 } else {
501                     exc("Error in attribute processing",line,col);
502                 }
503                 
504             } else if(mode == ATTRIBUTE_LVALUE) {
505                 if(Character.isWhitespace((char)c)) {
506                     lvalue = sb.toString();
507                     if (html)
508                         lvalue = lvalue.toLowerCase();
509                     sb.setLength(0);
510                     mode = ATTRIBUTE_EQUAL;
511                 } else if(c == '=') {
512                     lvalue = sb.toString();
513                     if (html)
514                         lvalue = lvalue.toLowerCase();
515                     sb.setLength(0);
516                     mode = ATTRIBUTE_RVALUE;
517                 } else if (html && c == '>') {
518                     sb.setLength(0);
519                     mode = popMode(st);
520                     doc.startElement(tagName,attrs);
521                     depth++;
522                     tagName = null;
523                     attrs = new HashMap();
524                 } else {
525                     sb.append((char)c);
526                 }
527                 
528             } else if(mode == ATTRIBUTE_EQUAL) {
529                 if(c == '=') {
530                     mode = ATTRIBUTE_RVALUE;
531                 } else if(Character.isWhitespace((char)c)) {
532                     ;
533                 } else if (html && c == '>') {
534                     sb.setLength(0);
535                     mode = popMode(st);
536                     doc.startElement(tagName,attrs);
537                     depth++;
538                     tagName = null;
539                     attrs = new HashMap();
540                 } else if (html && c == '/') {
541                     sb.setLength(0);
542                     mode = SINGLE_TAG;
543                 } else if (html) {
544                     sb.setLength(0);
545                     sb.append((char)c);
546                     mode = ATTRIBUTE_LVALUE;
547                 } else {
548                     exc("Error in attribute processing.",line,col);
549                 }
550                 
551             } else if(mode == IN_TAG) {
552                 if(c == '>') {
553                     mode = popMode(st);
554                     doc.startElement(tagName,attrs);
555                     depth++;
556                     tagName = null;
557                     attrs = new HashMap();
558                 } else if(c == '/') {
559                     mode = SINGLE_TAG;
560                 } else if(Character.isWhitespace((char)c)) {
561                     ;
562                 } else {
563                     mode = ATTRIBUTE_LVALUE;
564                     sb.append((char)c);
565                 }
566             }
567         }
568         if(html || mode == DONE) {
569             if (html && mode == TEXT)
570                 doc.text(sb.toString());
571             doc.endDocument();
572         }
573         else
574             exc("missing end tag",line,col);
575     }
576     private static void exc(String s,int line,int col) throws IOException {
577         throw new IOException(s+" near line "+line+", column "+col);
578     }
579     
580     /**
581      * Escapes a string with the appropriated XML codes.
582      * @param s the string to be escaped
583      * @param onlyASCII codes above 127 will always be escaped with &amp;#nn; if <CODE>true</CODE>
584      * @return the escaped string
585      */    
586     public static String escapeXML(String s, boolean onlyASCII) {
587         char cc[] = s.toCharArray();
588         int len = cc.length;
589         StringBuffer sb = new StringBuffer();
590         for (int k = 0; k < len; ++k) {
591             int c = cc[k];
592             switch (c) {
593                 case '<':
594                     sb.append("&lt;");
595                     break;
596                 case '>':
597                     sb.append("&gt;");
598                     break;
599                 case '&':
600                     sb.append("&amp;");
601                     break;
602                 case '"':
603                     sb.append("&quot;");
604                     break;
605                 case '\'':
606                     sb.append("&apos;");
607                     break;
608                 default:
609                     if (onlyASCII && c > 127)
610                         sb.append("&#").append(c).append(";");
611                     else
612                         sb.append((char)c);
613             }
614         }
615         return sb.toString();
616     }
617     
618     public static char decodeEntity(String s) {
619         Character c = (Character)entityMap.get(s);
620         if (c == null)
621             return '\0';
622         else
623             return c.charValue();
624     }
625     
626     private static String getEncodingName(byte[] b4) {
627         
628         // UTF-16, with BOM
629         int b0 = b4[0] & 0xFF;
630         int b1 = b4[1] & 0xFF;
631         if (b0 == 0xFE && b1 == 0xFF) {
632             // UTF-16, big-endian
633             return "UTF-16BE";
634         }
635         if (b0 == 0xFF && b1 == 0xFE) {
636             // UTF-16, little-endian
637             return "UTF-16LE";
638         }
639         
640         // UTF-8 with a BOM
641         int b2 = b4[2] & 0xFF;
642         if (b0 == 0xEF && b1 == 0xBB && b2 == 0xBF) {
643             return "UTF-8";
644         }
645         
646         // other encodings
647         int b3 = b4[3] & 0xFF;
648         if (b0 == 0x00 && b1 == 0x00 && b2 == 0x00 && b3 == 0x3C) {
649             // UCS-4, big endian (1234)
650             return "ISO-10646-UCS-4";
651         }
652         if (b0 == 0x3C && b1 == 0x00 && b2 == 0x00 && b3 == 0x00) {
653             // UCS-4, little endian (4321)
654             return "ISO-10646-UCS-4";
655         }
656         if (b0 == 0x00 && b1 == 0x00 && b2 == 0x3C && b3 == 0x00) {
657             // UCS-4, unusual octet order (2143)
658             // REVISIT: What should this be?
659             return "ISO-10646-UCS-4";
660         }
661         if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x00) {
662             // UCS-4, unusual octect order (3412)
663             // REVISIT: What should this be?
664             return "ISO-10646-UCS-4";
665         }
666         if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x3F) {
667             // UTF-16, big-endian, no BOM
668             // (or could turn out to be UCS-2...
669             // REVISIT: What should this be?
670             return "UTF-16BE";
671         }
672         if (b0 == 0x3C && b1 == 0x00 && b2 == 0x3F && b3 == 0x00) {
673             // UTF-16, little-endian, no BOM
674             // (or could turn out to be UCS-2...
675             return "UTF-16LE";
676         }
677         if (b0 == 0x4C && b1 == 0x6F && b2 == 0xA7 && b3 == 0x94) {
678             // EBCDIC
679             // a la xerces1, return CP037 instead of EBCDIC here
680             return "CP037";
681         }
682         
683         // default encoding
684         return "UTF-8";
685     }
686 
687     static {        
688         // add IANA to Java encoding mappings.
689         fIANA2JavaMap.put("BIG5", "Big5");
690         fIANA2JavaMap.put("CSBIG5", "Big5");
691         fIANA2JavaMap.put("CP037", "CP037");
692         fIANA2JavaMap.put("IBM037", "CP037");
693         fIANA2JavaMap.put("CSIBM037", "CP037");
694         fIANA2JavaMap.put("EBCDIC-CP-US", "CP037");
695         fIANA2JavaMap.put("EBCDIC-CP-CA", "CP037");
696         fIANA2JavaMap.put("EBCDIC-CP-NL", "CP037");
697         fIANA2JavaMap.put("EBCDIC-CP-WT", "CP037");
698         fIANA2JavaMap.put("IBM277", "CP277");
699         fIANA2JavaMap.put("CP277", "CP277");
700         fIANA2JavaMap.put("CSIBM277", "CP277");
701         fIANA2JavaMap.put("EBCDIC-CP-DK", "CP277");
702         fIANA2JavaMap.put("EBCDIC-CP-NO", "CP277");
703         fIANA2JavaMap.put("IBM278", "CP278");
704         fIANA2JavaMap.put("CP278", "CP278");
705         fIANA2JavaMap.put("CSIBM278", "CP278");
706         fIANA2JavaMap.put("EBCDIC-CP-FI", "CP278");
707         fIANA2JavaMap.put("EBCDIC-CP-SE", "CP278");
708         fIANA2JavaMap.put("IBM280", "CP280");
709         fIANA2JavaMap.put("CP280", "CP280");
710         fIANA2JavaMap.put("CSIBM280", "CP280");
711         fIANA2JavaMap.put("EBCDIC-CP-IT", "CP280");
712         fIANA2JavaMap.put("IBM284", "CP284");
713         fIANA2JavaMap.put("CP284", "CP284");
714         fIANA2JavaMap.put("CSIBM284", "CP284");
715         fIANA2JavaMap.put("EBCDIC-CP-ES", "CP284");
716         fIANA2JavaMap.put("EBCDIC-CP-GB", "CP285");
717         fIANA2JavaMap.put("IBM285", "CP285");
718         fIANA2JavaMap.put("CP285", "CP285");
719         fIANA2JavaMap.put("CSIBM285", "CP285");
720         fIANA2JavaMap.put("EBCDIC-CP-FR", "CP297");
721         fIANA2JavaMap.put("IBM297", "CP297");
722         fIANA2JavaMap.put("CP297", "CP297");
723         fIANA2JavaMap.put("CSIBM297", "CP297");
724         fIANA2JavaMap.put("EBCDIC-CP-AR1", "CP420");
725         fIANA2JavaMap.put("IBM420", "CP420");
726         fIANA2JavaMap.put("CP420", "CP420");
727         fIANA2JavaMap.put("CSIBM420", "CP420");
728         fIANA2JavaMap.put("EBCDIC-CP-HE", "CP424");
729         fIANA2JavaMap.put("IBM424", "CP424");
730         fIANA2JavaMap.put("CP424", "CP424");
731         fIANA2JavaMap.put("CSIBM424", "CP424");
732         fIANA2JavaMap.put("EBCDIC-CP-CH", "CP500");
733         fIANA2JavaMap.put("IBM500", "CP500");
734         fIANA2JavaMap.put("CP500", "CP500");
735         fIANA2JavaMap.put("CSIBM500", "CP500");
736         fIANA2JavaMap.put("EBCDIC-CP-CH", "CP500");
737         fIANA2JavaMap.put("EBCDIC-CP-BE", "CP500");
738         fIANA2JavaMap.put("IBM868", "CP868");
739         fIANA2JavaMap.put("CP868", "CP868");
740         fIANA2JavaMap.put("CSIBM868", "CP868");
741         fIANA2JavaMap.put("CP-AR", "CP868");
742         fIANA2JavaMap.put("IBM869", "CP869");
743         fIANA2JavaMap.put("CP869", "CP869");
744         fIANA2JavaMap.put("CSIBM869", "CP869");
745         fIANA2JavaMap.put("CP-GR", "CP869");
746         fIANA2JavaMap.put("IBM870", "CP870");
747         fIANA2JavaMap.put("CP870", "CP870");
748         fIANA2JavaMap.put("CSIBM870", "CP870");
749         fIANA2JavaMap.put("EBCDIC-CP-ROECE", "CP870");
750         fIANA2JavaMap.put("EBCDIC-CP-YU", "CP870");
751         fIANA2JavaMap.put("IBM871", "CP871");
752         fIANA2JavaMap.put("CP871", "CP871");
753         fIANA2JavaMap.put("CSIBM871", "CP871");
754         fIANA2JavaMap.put("EBCDIC-CP-IS", "CP871");
755         fIANA2JavaMap.put("IBM918", "CP918");
756         fIANA2JavaMap.put("CP918", "CP918");
757         fIANA2JavaMap.put("CSIBM918", "CP918");
758         fIANA2JavaMap.put("EBCDIC-CP-AR2", "CP918");
759         fIANA2JavaMap.put("EUC-JP", "EUCJIS");
760         fIANA2JavaMap.put("CSEUCPkdFmtJapanese", "EUCJIS");
761         fIANA2JavaMap.put("EUC-KR", "KSC5601");
762         fIANA2JavaMap.put("GB2312", "GB2312");
763         fIANA2JavaMap.put("CSGB2312", "GB2312");
764         fIANA2JavaMap.put("ISO-2022-JP", "JIS");
765         fIANA2JavaMap.put("CSISO2022JP", "JIS");
766         fIANA2JavaMap.put("ISO-2022-KR", "ISO2022KR");
767         fIANA2JavaMap.put("CSISO2022KR", "ISO2022KR");
768         fIANA2JavaMap.put("ISO-2022-CN", "ISO2022CN");
769         
770         fIANA2JavaMap.put("X0201", "JIS0201");
771         fIANA2JavaMap.put("CSISO13JISC6220JP", "JIS0201");
772         fIANA2JavaMap.put("X0208", "JIS0208");
773         fIANA2JavaMap.put("ISO-IR-87", "JIS0208");
774         fIANA2JavaMap.put("X0208dbiJIS_X0208-1983", "JIS0208");
775         fIANA2JavaMap.put("CSISO87JISX0208", "JIS0208");
776         fIANA2JavaMap.put("X0212", "JIS0212");
777         fIANA2JavaMap.put("ISO-IR-159", "JIS0212");
778         fIANA2JavaMap.put("CSISO159JISX02121990", "JIS0212");
779         fIANA2JavaMap.put("SHIFT_JIS", "SJIS");
780         fIANA2JavaMap.put("CSSHIFT_JIS", "SJIS");
781         fIANA2JavaMap.put("MS_Kanji", "SJIS");
782         
783         // Add support for Cp1252 and its friends
784         fIANA2JavaMap.put("WINDOWS-1250", "Cp1250");
785         fIANA2JavaMap.put("WINDOWS-1251", "Cp1251");
786         fIANA2JavaMap.put("WINDOWS-1252", "Cp1252");
787         fIANA2JavaMap.put("WINDOWS-1253", "Cp1253");
788         fIANA2JavaMap.put("WINDOWS-1254", "Cp1254");
789         fIANA2JavaMap.put("WINDOWS-1255", "Cp1255");
790         fIANA2JavaMap.put("WINDOWS-1256", "Cp1256");
791         fIANA2JavaMap.put("WINDOWS-1257", "Cp1257");
792         fIANA2JavaMap.put("WINDOWS-1258", "Cp1258");
793         fIANA2JavaMap.put("TIS-620", "TIS620");
794         
795         fIANA2JavaMap.put("ISO-8859-1", "ISO8859_1");
796         fIANA2JavaMap.put("ISO-IR-100", "ISO8859_1");
797         fIANA2JavaMap.put("ISO_8859-1", "ISO8859_1");
798         fIANA2JavaMap.put("LATIN1", "ISO8859_1");
799         fIANA2JavaMap.put("CSISOLATIN1", "ISO8859_1");
800         fIANA2JavaMap.put("L1", "ISO8859_1");
801         fIANA2JavaMap.put("IBM819", "ISO8859_1");
802         fIANA2JavaMap.put("CP819", "ISO8859_1");
803         
804         fIANA2JavaMap.put("ISO-8859-2", "ISO8859_2");
805         fIANA2JavaMap.put("ISO-IR-101", "ISO8859_2");
806         fIANA2JavaMap.put("ISO_8859-2", "ISO8859_2");
807         fIANA2JavaMap.put("LATIN2", "ISO8859_2");
808         fIANA2JavaMap.put("CSISOLATIN2", "ISO8859_2");
809         fIANA2JavaMap.put("L2", "ISO8859_2");
810         
811         fIANA2JavaMap.put("ISO-8859-3", "ISO8859_3");
812         fIANA2JavaMap.put("ISO-IR-109", "ISO8859_3");
813         fIANA2JavaMap.put("ISO_8859-3", "ISO8859_3");
814         fIANA2JavaMap.put("LATIN3", "ISO8859_3");
815         fIANA2JavaMap.put("CSISOLATIN3", "ISO8859_3");
816         fIANA2JavaMap.put("L3", "ISO8859_3");
817         
818         fIANA2JavaMap.put("ISO-8859-4", "ISO8859_4");
819         fIANA2JavaMap.put("ISO-IR-110", "ISO8859_4");
820         fIANA2JavaMap.put("ISO_8859-4", "ISO8859_4");
821         fIANA2JavaMap.put("LATIN4", "ISO8859_4");
822         fIANA2JavaMap.put("CSISOLATIN4", "ISO8859_4");
823         fIANA2JavaMap.put("L4", "ISO8859_4");
824         
825         fIANA2JavaMap.put("ISO-8859-5", "ISO8859_5");
826         fIANA2JavaMap.put("ISO-IR-144", "ISO8859_5");
827         fIANA2JavaMap.put("ISO_8859-5", "ISO8859_5");
828         fIANA2JavaMap.put("CYRILLIC", "ISO8859_5");
829         fIANA2JavaMap.put("CSISOLATINCYRILLIC", "ISO8859_5");
830         
831         fIANA2JavaMap.put("ISO-8859-6", "ISO8859_6");
832         fIANA2JavaMap.put("ISO-IR-127", "ISO8859_6");
833         fIANA2JavaMap.put("ISO_8859-6", "ISO8859_6");
834         fIANA2JavaMap.put("ECMA-114", "ISO8859_6");
835         fIANA2JavaMap.put("ASMO-708", "ISO8859_6");
836         fIANA2JavaMap.put("ARABIC", "ISO8859_6");
837         fIANA2JavaMap.put("CSISOLATINARABIC", "ISO8859_6");
838         
839         fIANA2JavaMap.put("ISO-8859-7", "ISO8859_7");
840         fIANA2JavaMap.put("ISO-IR-126", "ISO8859_7");
841         fIANA2JavaMap.put("ISO_8859-7", "ISO8859_7");
842         fIANA2JavaMap.put("ELOT_928", "ISO8859_7");
843         fIANA2JavaMap.put("ECMA-118", "ISO8859_7");
844         fIANA2JavaMap.put("GREEK", "ISO8859_7");
845         fIANA2JavaMap.put("CSISOLATINGREEK", "ISO8859_7");
846         fIANA2JavaMap.put("GREEK8", "ISO8859_7");
847         
848         fIANA2JavaMap.put("ISO-8859-8", "ISO8859_8");
849         fIANA2JavaMap.put("ISO-8859-8-I", "ISO8859_8"); // added since this encoding only differs w.r.t. presentation
850         fIANA2JavaMap.put("ISO-IR-138", "ISO8859_8");
851         fIANA2JavaMap.put("ISO_8859-8", "ISO8859_8");
852         fIANA2JavaMap.put("HEBREW", "ISO8859_8");
853         fIANA2JavaMap.put("CSISOLATINHEBREW", "ISO8859_8");
854         
855         fIANA2JavaMap.put("ISO-8859-9", "ISO8859_9");
856         fIANA2JavaMap.put("ISO-IR-148", "ISO8859_9");
857         fIANA2JavaMap.put("ISO_8859-9", "ISO8859_9");
858         fIANA2JavaMap.put("LATIN5", "ISO8859_9");
859         fIANA2JavaMap.put("CSISOLATIN5", "ISO8859_9");
860         fIANA2JavaMap.put("L5", "ISO8859_9");
861         
862         fIANA2JavaMap.put("KOI8-R", "KOI8_R");
863         fIANA2JavaMap.put("CSKOI8-R", "KOI8_R");
864         fIANA2JavaMap.put("US-ASCII", "ASCII");
865         fIANA2JavaMap.put("ISO-IR-6", "ASCII");
866         fIANA2JavaMap.put("ANSI_X3.4-1986", "ASCII");
867         fIANA2JavaMap.put("ISO_646.IRV:1991", "ASCII");
868         fIANA2JavaMap.put("ASCII", "ASCII");
869         fIANA2JavaMap.put("CSASCII", "ASCII");
870         fIANA2JavaMap.put("ISO646-US", "ASCII");
871         fIANA2JavaMap.put("US", "ASCII");
872         fIANA2JavaMap.put("IBM367", "ASCII");
873         fIANA2JavaMap.put("CP367", "ASCII");
874         fIANA2JavaMap.put("UTF-8", "UTF8");
875         fIANA2JavaMap.put("UTF-16", "Unicode");
876         fIANA2JavaMap.put("UTF-16BE", "UnicodeBig");
877         fIANA2JavaMap.put("UTF-16LE", "UnicodeLittle");
878 
879         entityMap.put("nbsp", new Character('\u00a0')); // no-break space = non-breaking space, U+00A0 ISOnum
880         entityMap.put("iexcl", new Character('\u00a1')); // inverted exclamation mark, U+00A1 ISOnum
881         entityMap.put("cent", new Character('\u00a2')); // cent sign, U+00A2 ISOnum
882         entityMap.put("pound", new Character('\u00a3')); // pound sign, U+00A3 ISOnum
883         entityMap.put("curren", new Character('\u00a4')); // currency sign, U+00A4 ISOnum
884         entityMap.put("yen", new Character('\u00a5')); // yen sign = yuan sign, U+00A5 ISOnum
885         entityMap.put("brvbar", new Character('\u00a6')); // broken bar = broken vertical bar, U+00A6 ISOnum
886         entityMap.put("sect", new Character('\u00a7')); // section sign, U+00A7 ISOnum
887         entityMap.put("uml", new Character('\u00a8')); // diaeresis = spacing diaeresis, U+00A8 ISOdia
888         entityMap.put("copy", new Character('\u00a9')); // copyright sign, U+00A9 ISOnum
889         entityMap.put("ordf", new Character('\u00aa')); // feminine ordinal indicator, U+00AA ISOnum
890         entityMap.put("laquo", new Character('\u00ab')); // left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum
891         entityMap.put("not", new Character('\u00ac')); // not sign, U+00AC ISOnum
892         entityMap.put("shy", new Character('\u00ad')); // soft hyphen = discretionary hyphen, U+00AD ISOnum
893         entityMap.put("reg", new Character('\u00ae')); // registered sign = registered trade mark sign, U+00AE ISOnum
894         entityMap.put("macr", new Character('\u00af')); // macron = spacing macron = overline = APL overbar, U+00AF ISOdia
895         entityMap.put("deg", new Character('\u00b0')); // degree sign, U+00B0 ISOnum
896         entityMap.put("plusmn", new Character('\u00b1')); // plus-minus sign = plus-or-minus sign, U+00B1 ISOnum
897         entityMap.put("sup2", new Character('\u00b2')); // superscript two = superscript digit two = squared, U+00B2 ISOnum
898         entityMap.put("sup3", new Character('\u00b3')); // superscript three = superscript digit three = cubed, U+00B3 ISOnum
899         entityMap.put("acute", new Character('\u00b4')); // acute accent = spacing acute, U+00B4 ISOdia
900         entityMap.put("micro", new Character('\u00b5')); // micro sign, U+00B5 ISOnum
901         entityMap.put("para", new Character('\u00b6')); // pilcrow sign = paragraph sign, U+00B6 ISOnum
902         entityMap.put("middot", new Character('\u00b7')); // middle dot = Georgian comma = Greek middle dot, U+00B7 ISOnum
903         entityMap.put("cedil", new Character('\u00b8')); // cedilla = spacing cedilla, U+00B8 ISOdia
904         entityMap.put("sup1", new Character('\u00b9')); // superscript one = superscript digit one, U+00B9 ISOnum
905         entityMap.put("ordm", new Character('\u00ba')); // masculine ordinal indicator, U+00BA ISOnum
906         entityMap.put("raquo", new Character('\u00bb')); // right-pointing double angle quotation mark = right pointing guillemet, U+00BB ISOnum
907         entityMap.put("frac14", new Character('\u00bc')); // vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum
908         entityMap.put("frac12", new Character('\u00bd')); // vulgar fraction one half = fraction one half, U+00BD ISOnum
909         entityMap.put("frac34", new Character('\u00be')); // vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum
910         entityMap.put("iquest", new Character('\u00bf')); // inverted question mark = turned question mark, U+00BF ISOnum
911         entityMap.put("Agrave", new Character('\u00c0')); // latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1
912         entityMap.put("Aacute", new Character('\u00c1')); // latin capital letter A with acute, U+00C1 ISOlat1
913         entityMap.put("Acirc", new Character('\u00c2')); // latin capital letter A with circumflex, U+00C2 ISOlat1
914         entityMap.put("Atilde", new Character('\u00c3')); // latin capital letter A with tilde, U+00C3 ISOlat1
915         entityMap.put("Auml", new Character('\u00c4')); // latin capital letter A with diaeresis, U+00C4 ISOlat1
916         entityMap.put("Aring", new Character('\u00c5')); // latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1
917         entityMap.put("AElig", new Character('\u00c6')); // latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1
918         entityMap.put("Ccedil", new Character('\u00c7')); // latin capital letter C with cedilla, U+00C7 ISOlat1
919         entityMap.put("Egrave", new Character('\u00c8')); // latin capital letter E with grave, U+00C8 ISOlat1
920         entityMap.put("Eacute", new Character('\u00c9')); // latin capital letter E with acute, U+00C9 ISOlat1
921         entityMap.put("Ecirc", new Character('\u00ca')); // latin capital letter E with circumflex, U+00CA ISOlat1
922         entityMap.put("Euml", new Character('\u00cb')); // latin capital letter E with diaeresis, U+00CB ISOlat1
923         entityMap.put("Igrave", new Character('\u00cc')); // latin capital letter I with grave, U+00CC ISOlat1
924         entityMap.put("Iacute", new Character('\u00cd')); // latin capital letter I with acute, U+00CD ISOlat1
925         entityMap.put("Icirc", new Character('\u00ce')); // latin capital letter I with circumflex, U+00CE ISOlat1
926         entityMap.put("Iuml", new Character('\u00cf')); // latin capital letter I with diaeresis, U+00CF ISOlat1
927         entityMap.put("ETH", new Character('\u00d0')); // latin capital letter ETH, U+00D0 ISOlat1
928         entityMap.put("Ntilde", new Character('\u00d1')); // latin capital letter N with tilde, U+00D1 ISOlat1
929         entityMap.put("Ograve", new Character('\u00d2')); // latin capital letter O with grave, U+00D2 ISOlat1
930         entityMap.put("Oacute", new Character('\u00d3')); // latin capital letter O with acute, U+00D3 ISOlat1
931         entityMap.put("Ocirc", new Character('\u00d4')); // latin capital letter O with circumflex, U+00D4 ISOlat1
932         entityMap.put("Otilde", new Character('\u00d5')); // latin capital letter O with tilde, U+00D5 ISOlat1
933         entityMap.put("Ouml", new Character('\u00d6')); // latin capital letter O with diaeresis, U+00D6 ISOlat1
934         entityMap.put("times", new Character('\u00d7')); // multiplication sign, U+00D7 ISOnum
935         entityMap.put("Oslash", new Character('\u00d8')); // latin capital letter O with stroke = latin capital letter O slash, U+00D8 ISOlat1
936         entityMap.put("Ugrave", new Character('\u00d9')); // latin capital letter U with grave, U+00D9 ISOlat1
937         entityMap.put("Uacute", new Character('\u00da')); // latin capital letter U with acute, U+00DA ISOlat1
938         entityMap.put("Ucirc", new Character('\u00db')); // latin capital letter U with circumflex, U+00DB ISOlat1
939         entityMap.put("Uuml", new Character('\u00dc')); // latin capital letter U with diaeresis, U+00DC ISOlat1
940         entityMap.put("Yacute", new Character('\u00dd')); // latin capital letter Y with acute, U+00DD ISOlat1
941         entityMap.put("THORN", new Character('\u00de')); // latin capital letter THORN, U+00DE ISOlat1
942         entityMap.put("szlig", new Character('\u00df')); // latin small letter sharp s = ess-zed, U+00DF ISOlat1
943         entityMap.put("agrave", new Character('\u00e0')); // latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1
944         entityMap.put("aacute", new Character('\u00e1')); // latin small letter a with acute, U+00E1 ISOlat1
945         entityMap.put("acirc", new Character('\u00e2')); // latin small letter a with circumflex, U+00E2 ISOlat1
946         entityMap.put("atilde", new Character('\u00e3')); // latin small letter a with tilde, U+00E3 ISOlat1
947         entityMap.put("auml", new Character('\u00e4')); // latin small letter a with diaeresis, U+00E4 ISOlat1
948         entityMap.put("aring", new Character('\u00e5')); // latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1
949         entityMap.put("aelig", new Character('\u00e6')); // latin small letter ae = latin small ligature ae, U+00E6 ISOlat1
950         entityMap.put("ccedil", new Character('\u00e7')); // latin small letter c with cedilla, U+00E7 ISOlat1
951         entityMap.put("egrave", new Character('\u00e8')); // latin small letter e with grave, U+00E8 ISOlat1
952         entityMap.put("eacute", new Character('\u00e9')); // latin small letter e with acute, U+00E9 ISOlat1
953         entityMap.put("ecirc", new Character('\u00ea')); // latin small letter e with circumflex, U+00EA ISOlat1
954         entityMap.put("euml", new Character('\u00eb')); // latin small letter e with diaeresis, U+00EB ISOlat1
955         entityMap.put("igrave", new Character('\u00ec')); // latin small letter i with grave, U+00EC ISOlat1
956         entityMap.put("iacute", new Character('\u00ed')); // latin small letter i with acute, U+00ED ISOlat1
957         entityMap.put("icirc", new Character('\u00ee')); // latin small letter i with circumflex, U+00EE ISOlat1
958         entityMap.put("iuml", new Character('\u00ef')); // latin small letter i with diaeresis, U+00EF ISOlat1
959         entityMap.put("eth", new Character('\u00f0')); // latin small letter eth, U+00F0 ISOlat1
960         entityMap.put("ntilde", new Character('\u00f1')); // latin small letter n with tilde, U+00F1 ISOlat1
961         entityMap.put("ograve", new Character('\u00f2')); // latin small letter o with grave, U+00F2 ISOlat1
962         entityMap.put("oacute", new Character('\u00f3')); // latin small letter o with acute, U+00F3 ISOlat1
963         entityMap.put("ocirc", new Character('\u00f4')); // latin small letter o with circumflex, U+00F4 ISOlat1
964         entityMap.put("otilde", new Character('\u00f5')); // latin small letter o with tilde, U+00F5 ISOlat1
965         entityMap.put("ouml", new Character('\u00f6')); // latin small letter o with diaeresis, U+00F6 ISOlat1
966         entityMap.put("divide", new Character('\u00f7')); // division sign, U+00F7 ISOnum
967         entityMap.put("oslash", new Character('\u00f8')); // latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1
968         entityMap.put("ugrave", new Character('\u00f9')); // latin small letter u with grave, U+00F9 ISOlat1
969         entityMap.put("uacute", new Character('\u00fa')); // latin small letter u with acute, U+00FA ISOlat1
970         entityMap.put("ucirc", new Character('\u00fb')); // latin small letter u with circumflex, U+00FB ISOlat1
971         entityMap.put("uuml", new Character('\u00fc')); // latin small letter u with diaeresis, U+00FC ISOlat1
972         entityMap.put("yacute", new Character('\u00fd')); // latin small letter y with acute, U+00FD ISOlat1
973         entityMap.put("thorn", new Character('\u00fe')); // latin small letter thorn, U+00FE ISOlat1
974         entityMap.put("yuml", new Character('\u00ff')); // latin small letter y with diaeresis, U+00FF ISOlat1
975         // Latin Extended-B
976         entityMap.put("fnof", new Character('\u0192')); // latin small f with hook = function = florin, U+0192 ISOtech
977         // Greek
978         entityMap.put("Alpha", new Character('\u0391')); // greek capital letter alpha, U+0391
979         entityMap.put("Beta", new Character('\u0392')); // greek capital letter beta, U+0392
980         entityMap.put("Gamma", new Character('\u0393')); // greek capital letter gamma, U+0393 ISOgrk3
981         entityMap.put("Delta", new Character('\u0394')); // greek capital letter delta, U+0394 ISOgrk3
982         entityMap.put("Epsilon", new Character('\u0395')); // greek capital letter epsilon, U+0395
983         entityMap.put("Zeta", new Character('\u0396')); // greek capital letter zeta, U+0396
984         entityMap.put("Eta", new Character('\u0397')); // greek capital letter eta, U+0397
985         entityMap.put("Theta", new Character('\u0398')); // greek capital letter theta, U+0398 ISOgrk3
986         entityMap.put("Iota", new Character('\u0399')); // greek capital letter iota, U+0399
987         entityMap.put("Kappa", new Character('\u039a')); // greek capital letter kappa, U+039A
988         entityMap.put("Lambda", new Character('\u039b')); // greek capital letter lambda, U+039B ISOgrk3
989         entityMap.put("Mu", new Character('\u039c')); // greek capital letter mu, U+039C
990         entityMap.put("Nu", new Character('\u039d')); // greek capital letter nu, U+039D
991         entityMap.put("Xi", new Character('\u039e')); // greek capital letter xi, U+039E ISOgrk3
992         entityMap.put("Omicron", new Character('\u039f')); // greek capital letter omicron, U+039F
993         entityMap.put("Pi", new Character('\u03a0')); // greek capital letter pi, U+03A0 ISOgrk3
994         entityMap.put("Rho", new Character('\u03a1')); // greek capital letter rho, U+03A1
995         // there is no Sigmaf, and no U+03A2 character either
996         entityMap.put("Sigma", new Character('\u03a3')); // greek capital letter sigma, U+03A3 ISOgrk3
997         entityMap.put("Tau", new Character('\u03a4')); // greek capital letter tau, U+03A4
998         entityMap.put("Upsilon", new Character('\u03a5')); // greek capital letter upsilon, U+03A5 ISOgrk3
999         entityMap.put("Phi", new Character('\u03a6')); // greek capital letter phi, U+03A6 ISOgrk3
1000        entityMap.put("Chi", new Character('\u03a7')); // greek capital letter chi, U+03A7
1001        entityMap.put("Psi", new Character('\u03a8')); // greek capital letter psi, U+03A8 ISOgrk3
1002        entityMap.put("Omega", new Character('\u03a9')); // greek capital letter omega, U+03A9 ISOgrk3
1003        entityMap.put("alpha", new Character('\u03b1')); // greek small letter alpha, U+03B1 ISOgrk3
1004        entityMap.put("beta", new Character('\u03b2')); // greek small letter beta, U+03B2 ISOgrk3
1005        entityMap.put("gamma", new Character('\u03b3')); // greek small letter gamma, U+03B3 ISOgrk3
1006        entityMap.put("delta", new Character('\u03b4')); // greek small letter delta, U+03B4 ISOgrk3
1007        entityMap.put("epsilon", new Character('\u03b5')); // greek small letter epsilon, U+03B5 ISOgrk3
1008        entityMap.put("zeta", new Character('\u03b6')); // greek small letter zeta, U+03B6 ISOgrk3
1009        entityMap.put("eta", new Character('\u03b7')); // greek small letter eta, U+03B7 ISOgrk3
1010        entityMap.put("theta", new Character('\u03b8')); // greek small letter theta, U+03B8 ISOgrk3
1011        entityMap.put("iota", new Character('\u03b9')); // greek small letter iota, U+03B9 ISOgrk3
1012        entityMap.put("kappa", new Character('\u03ba')); // greek small letter kappa, U+03BA ISOgrk3
1013        entityMap.put("lambda", new Character('\u03bb')); // greek small letter lambda, U+03BB ISOgrk3
1014        entityMap.put("mu", new Character('\u03bc')); // greek small letter mu, U+03BC ISOgrk3
1015        entityMap.put("nu", new Character('\u03bd')); // greek small letter nu, U+03BD ISOgrk3
1016        entityMap.put("xi", new Character('\u03be')); // greek small letter xi, U+03BE ISOgrk3
1017        entityMap.put("omicron", new Character('\u03bf')); // greek small letter omicron, U+03BF NEW
1018        entityMap.put("pi", new Character('\u03c0')); // greek small letter pi, U+03C0 ISOgrk3
1019        entityMap.put("rho", new Character('\u03c1')); // greek small letter rho, U+03C1 ISOgrk3
1020        entityMap.put("sigmaf", new Character('\u03c2')); // greek small letter final sigma, U+03C2 ISOgrk3
1021        entityMap.put("sigma", new Character('\u03c3')); // greek small letter sigma, U+03C3 ISOgrk3
1022        entityMap.put("tau", new Character('\u03c4')); // greek small letter tau, U+03C4 ISOgrk3
1023        entityMap.put("upsilon", new Character('\u03c5')); // greek small letter upsilon, U+03C5 ISOgrk3
1024        entityMap.put("phi", new Character('\u03c6')); // greek small letter phi, U+03C6 ISOgrk3
1025        entityMap.put("chi", new Character('\u03c7')); // greek small letter chi, U+03C7 ISOgrk3
1026        entityMap.put("psi", new Character('\u03c8')); // greek small letter psi, U+03C8 ISOgrk3
1027        entityMap.put("omega", new Character('\u03c9')); // greek small letter omega, U+03C9 ISOgrk3
1028        entityMap.put("thetasym", new Character('\u03d1')); // greek small letter theta symbol, U+03D1 NEW
1029        entityMap.put("upsih", new Character('\u03d2')); // greek upsilon with hook symbol, U+03D2 NEW
1030        entityMap.put("piv", new Character('\u03d6')); // greek pi symbol, U+03D6 ISOgrk3
1031        // General Punctuation
1032        entityMap.put("bull", new Character('\u2022')); // bullet = black small circle, U+2022 ISOpub
1033        // bullet is NOT the same as bullet operator, U+2219
1034        entityMap.put("hellip", new Character('\u2026')); // horizontal ellipsis = three dot leader, U+2026 ISOpub
1035        entityMap.put("prime", new Character('\u2032')); // prime = minutes = feet, U+2032 ISOtech
1036        entityMap.put("Prime", new Character('\u2033')); // double prime = seconds = inches, U+2033 ISOtech
1037        entityMap.put("oline", new Character('\u203e')); // overline = spacing overscore, U+203E NEW
1038        entityMap.put("frasl", new Character('\u2044')); // fraction slash, U+2044 NEW
1039        // Letterlike Symbols
1040        entityMap.put("weierp", new Character('\u2118')); // script capital P = power set = Weierstrass p, U+2118 ISOamso
1041        entityMap.put("image", new Character('\u2111')); // blackletter capital I = imaginary part, U+2111 ISOamso
1042        entityMap.put("real", new Character('\u211c')); // blackletter capital R = real part symbol, U+211C ISOamso
1043        entityMap.put("trade", new Character('\u2122')); // trade mark sign, U+2122 ISOnum
1044        entityMap.put("alefsym", new Character('\u2135')); // alef symbol = first transfinite cardinal, U+2135 NEW
1045        // alef symbol is NOT the same as hebrew letter alef,
1046        // U+05D0 although the same glyph could be used to depict both characters
1047        // Arrows
1048        entityMap.put("larr", new Character('\u2190')); // leftwards arrow, U+2190 ISOnum
1049        entityMap.put("uarr", new Character('\u2191')); // upwards arrow, U+2191 ISOnum
1050        entityMap.put("rarr", new Character('\u2192')); // rightwards arrow, U+2192 ISOnum
1051        entityMap.put("darr", new Character('\u2193')); // downwards arrow, U+2193 ISOnum
1052        entityMap.put("harr", new Character('\u2194')); // left right arrow, U+2194 ISOamsa
1053        entityMap.put("crarr", new Character('\u21b5')); // downwards arrow with corner leftwards = carriage return, U+21B5 NEW
1054        entityMap.put("lArr", new Character('\u21d0')); // leftwards double arrow, U+21D0 ISOtech
1055        // ISO 10646 does not say that lArr is the same as the 'is implied by' arrow,
1056        // but it also does not have any other character for that function, so lArr can
1057        // be used for 'is implied by' as ISOtech suggests.
1058        entityMap.put("uArr", new Character('\u21d1')); // upwards double arrow, U+21D1 ISOamsa
1059        entityMap.put("rArr", new Character('\u21d2')); // rightwards double arrow, U+21D2 ISOtech
1060        // ISO 10646 does not say this is the 'implies' character, but it does not have
1061        // another character with this function, so
1062        // rArr can be used for 'implies' as ISOtech suggests.
1063        entityMap.put("dArr", new Character('\u21d3')); // downwards double arrow, U+21D3 ISOamsa
1064        entityMap.put("hArr", new Character('\u21d4')); // left right double arrow, U+21D4 ISOamsa
1065        // Mathematical Operators
1066        entityMap.put("forall", new Character('\u2200')); // for all, U+2200 ISOtech
1067        entityMap.put("part", new Character('\u2202')); // partial differential, U+2202 ISOtech
1068        entityMap.put("exist", new Character('\u2203')); // there exists, U+2203 ISOtech
1069        entityMap.put("empty", new Character('\u2205')); // empty set = null set = diameter, U+2205 ISOamso
1070        entityMap.put("nabla", new Character('\u2207')); // nabla = backward difference, U+2207 ISOtech
1071        entityMap.put("isin", new Character('\u2208')); // element of, U+2208 ISOtech
1072        entityMap.put("notin", new Character('\u2209')); // not an element of, U+2209 ISOtech
1073        entityMap.put("ni", new Character('\u220b')); // contains as member, U+220B ISOtech
1074        // should there be a more memorable name than 'ni'?
1075        entityMap.put("prod", new Character('\u220f')); // n-ary product = product sign, U+220F ISOamsb
1076        // prod is NOT the same character as U+03A0 'greek capital letter pi' though
1077        // the same glyph might be used for both
1078        entityMap.put("sum", new Character('\u2211')); // n-ary sumation, U+2211 ISOamsb
1079        // sum is NOT the same character as U+03A3 'greek capital letter sigma'
1080        // though the same glyph might be used for both
1081        entityMap.put("minus", new Character('\u2212')); // minus sign, U+2212 ISOtech
1082        entityMap.put("lowast", new Character('\u2217')); // asterisk operator, U+2217 ISOtech
1083        entityMap.put("radic", new Character('\u221a')); // square root = radical sign, U+221A ISOtech
1084        entityMap.put("prop", new Character('\u221d')); // proportional to, U+221D ISOtech
1085        entityMap.put("infin", new Character('\u221e')); // infinity, U+221E ISOtech
1086        entityMap.put("ang", new Character('\u2220')); // angle, U+2220 ISOamso
1087        entityMap.put("and", new Character('\u2227')); // logical and = wedge, U+2227 ISOtech
1088        entityMap.put("or", new Character('\u2228')); // logical or = vee, U+2228 ISOtech
1089        entityMap.put("cap", new Character('\u2229')); // intersection = cap, U+2229 ISOtech
1090        entityMap.put("cup", new Character('\u222a')); // union = cup, U+222A ISOtech
1091        entityMap.put("int", new Character('\u222b')); // integral, U+222B ISOtech
1092        entityMap.put("there4", new Character('\u2234')); // therefore, U+2234 ISOtech
1093        entityMap.put("sim", new Character('\u223c')); // tilde operator = varies with = similar to, U+223C ISOtech
1094        // tilde operator is NOT the same character as the tilde, U+007E,
1095        // although the same glyph might be used to represent both
1096        entityMap.put("cong", new Character('\u2245')); // approximately equal to, U+2245 ISOtech
1097        entityMap.put("asymp", new Character('\u2248')); // almost equal to = asymptotic to, U+2248 ISOamsr
1098        entityMap.put("ne", new Character('\u2260')); // not equal to, U+2260 ISOtech
1099        entityMap.put("equiv", new Character('\u2261')); // identical to, U+2261 ISOtech
1100        entityMap.put("le", new Character('\u2264')); // less-than or equal to, U+2264 ISOtech
1101        entityMap.put("ge", new Character('\u2265')); // greater-than or equal to, U+2265 ISOtech
1102        entityMap.put("sub", new Character('\u2282')); // subset of, U+2282 ISOtech
1103        entityMap.put("sup", new Character('\u2283')); // superset of, U+2283 ISOtech
1104        // note that nsup, 'not a superset of', U+2285, is not covered by the Symbol
1105        // font encoding and is not included. Should it be, for symmetry?
1106        // It is in ISOamsn
1107        entityMap.put("nsub", new Character('\u2284')); // not a subset of, U+2284 ISOamsn
1108        entityMap.put("sube", new Character('\u2286')); // subset of or equal to, U+2286 ISOtech
1109        entityMap.put("supe", new Character('\u2287')); // superset of or equal to, U+2287 ISOtech
1110        entityMap.put("oplus", new Character('\u2295')); // circled plus = direct sum, U+2295 ISOamsb
1111        entityMap.put("otimes", new Character('\u2297')); // circled times = vector product, U+2297 ISOamsb
1112        entityMap.put("perp", new Character('\u22a5')); // up tack = orthogonal to = perpendicular, U+22A5 ISOtech
1113        entityMap.put("sdot", new Character('\u22c5')); // dot operator, U+22C5 ISOamsb
1114        // dot operator is NOT the same character as U+00B7 middle dot
1115        // Miscellaneous Technical
1116        entityMap.put("lceil", new Character('\u2308')); // left ceiling = apl upstile, U+2308 ISOamsc
1117        entityMap.put("rceil", new Character('\u2309')); // right ceiling, U+2309 ISOamsc
1118        entityMap.put("lfloor", new Character('\u230a')); // left floor = apl downstile, U+230A ISOamsc
1119        entityMap.put("rfloor", new Character('\u230b')); // right floor, U+230B ISOamsc
1120        entityMap.put("lang", new Character('\u2329')); // left-pointing angle bracket = bra, U+2329 ISOtech
1121        // lang is NOT the same character as U+003C 'less than' 
1122        // or U+2039 'single left-pointing angle quotation mark'
1123        entityMap.put("rang", new Character('\u232a')); // right-pointing angle bracket = ket, U+232A ISOtech
1124        // rang is NOT the same character as U+003E 'greater than' 
1125        // or U+203A 'single right-pointing angle quotation mark'
1126        // Geometric Shapes
1127        entityMap.put("loz", new Character('\u25ca')); // lozenge, U+25CA ISOpub
1128        // Miscellaneous Symbols
1129        entityMap.put("spades", new Character('\u2660')); // black spade suit, U+2660 ISOpub
1130        // black here seems to mean filled as opposed to hollow
1131        entityMap.put("clubs", new Character('\u2663')); // black club suit = shamrock, U+2663 ISOpub
1132        entityMap.put("hearts", new Character('\u2665')); // black heart suit = valentine, U+2665 ISOpub
1133        entityMap.put("diams", new Character('\u2666')); // black diamond suit, U+2666 ISOpub
1134        // C0 Controls and Basic Latin
1135        entityMap.put("quot", new Character('\u0022')); // quotation mark = APL quote, U+0022 ISOnum
1136        entityMap.put("amp", new Character('\u0026')); // ampersand, U+0026 ISOnum
1137        entityMap.put("apos", new Character('\''));
1138        entityMap.put("lt", new Character('\u003c')); // less-than sign, U+003C ISOnum
1139        entityMap.put("gt", new Character('\u003e')); // greater-than sign, U+003E ISOnum
1140        // Latin Extended-A
1141        entityMap.put("OElig", new Character('\u0152')); // latin capital ligature OE, U+0152 ISOlat2
1142        entityMap.put("oelig", new Character('\u0153')); // latin small ligature oe, U+0153 ISOlat2
1143        // ligature is a misnomer, this is a separate character in some languages
1144        entityMap.put("Scaron", new Character('\u0160')); // latin capital letter S with caron, U+0160 ISOlat2
1145        entityMap.put("scaron", new Character('\u0161')); // latin small letter s with caron, U+0161 ISOlat2
1146        entityMap.put("Yuml", new Character('\u0178')); // latin capital letter Y with diaeresis, U+0178 ISOlat2
1147        // Spacing Modifier Letters
1148        entityMap.put("circ", new Character('\u02c6')); // modifier letter circumflex accent, U+02C6 ISOpub
1149        entityMap.put("tilde", new Character('\u02dc')); // small tilde, U+02DC ISOdia
1150        // General Punctuation
1151        entityMap.put("ensp", new Character('\u2002')); // en space, U+2002 ISOpub
1152        entityMap.put("emsp", new Character('\u2003')); // em space, U+2003 ISOpub
1153        entityMap.put("thinsp", new Character('\u2009')); // thin space, U+2009 ISOpub
1154        entityMap.put("zwnj", new Character('\u200c')); // zero width non-joiner, U+200C NEW RFC 2070
1155        entityMap.put("zwj", new Character('\u200d')); // zero width joiner, U+200D NEW RFC 2070
1156        entityMap.put("lrm", new Character('\u200e')); // left-to-right mark, U+200E NEW RFC 2070
1157        entityMap.put("rlm", new Character('\u200f')); // right-to-left mark, U+200F NEW RFC 2070
1158        entityMap.put("ndash", new Character('\u2013')); // en dash, U+2013 ISOpub
1159        entityMap.put("mdash", new Character('\u2014')); // em dash, U+2014 ISOpub
1160        entityMap.put("lsquo", new Character('\u2018')); // left single quotation mark, U+2018 ISOnum
1161        entityMap.put("rsquo", new Character('\u2019')); // right single quotation mark, U+2019 ISOnum
1162        entityMap.put("sbquo", new Character('\u201a')); // single low-9 quotation mark, U+201A NEW
1163        entityMap.put("ldquo", new Character('\u201c')); // left double quotation mark, U+201C ISOnum
1164        entityMap.put("rdquo", new Character('\u201d')); // right double quotation mark, U+201D ISOnum
1165        entityMap.put("bdquo", new Character('\u201e')); // double low-9 quotation mark, U+201E NEW
1166        entityMap.put("dagger", new Character('\u2020')); // dagger, U+2020 ISOpub
1167        entityMap.put("Dagger", new Character('\u2021')); // double dagger, U+2021 ISOpub
1168        entityMap.put("permil", new Character('\u2030')); // per mille sign, U+2030 ISOtech
1169        entityMap.put("lsaquo", new Character('\u2039')); // single left-pointing angle quotation mark, U+2039 ISO proposed
1170        // lsaquo is proposed but not yet ISO standardized
1171        entityMap.put("rsaquo", new Character('\u203a')); // single right-pointing angle quotation mark, U+203A ISO proposed
1172        // rsaquo is proposed but not yet ISO standardized
1173        entityMap.put("euro", new Character('\u20ac')); // euro sign, U+20AC NEW
1174    
1175    
1176    }
1177}
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Java Books Remove Frame
Popular Tags