KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > htmlparser > util > Translate


1 // HTMLParser Library $Name: v1_5_20050313 $ - A java-based parser for HTML
2
// http://sourceforge.org/projects/htmlparser
3
// Copyright (C) 2004 Derrick Oswald
4
//
5
// Revision Control Information
6
//
7
// $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/util/Translate.java,v $
8
// $Author: derrickoswald $
9
// $Date: 2004/07/31 16:42:33 $
10
// $Revision: 1.46 $
11
//
12
// This library is free software; you can redistribute it and/or
13
// modify it under the terms of the GNU Lesser General Public
14
// License as published by the Free Software Foundation; either
15
// version 2.1 of the License, or (at your option) any later version.
16
//
17
// This library is distributed in the hope that it will be useful,
18
// but WITHOUT ANY WARRANTY; without even the implied warranty of
19
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20
// Lesser General Public License for more details.
21
//
22
// You should have received a copy of the GNU Lesser General Public
23
// License along with this library; if not, write to the Free Software
24
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
25
//
26

27 package org.htmlparser.util;
28
29 import java.io.BufferedReader JavaDoc;
30 import java.io.BufferedWriter JavaDoc;
31 import java.io.IOException JavaDoc;
32 import java.io.InputStream JavaDoc;
33 import java.io.InputStreamReader JavaDoc;
34 import java.io.OutputStreamWriter JavaDoc;
35 import java.io.PrintStream JavaDoc;
36 import java.io.PrintWriter JavaDoc;
37 import java.io.Reader JavaDoc;
38 import java.io.UnsupportedEncodingException JavaDoc;
39
40 import org.htmlparser.util.sort.Sort;
41
42 /**
43  * Extended character entity reference.
44  * Handles kernels within other strings, just for lookup purposes.
45  */

46 class CharacterReferenceEx extends CharacterReference
47 {
48     /**
49      * The starting point in the string.
50      */

51     protected int mStart;
52
53     /**
54      * The ending point in the string.
55      */

56     protected int mEnd;
57
58     /**
59      * Zero args constructor.
60      * This object is only ever used after setting the kernel, start and end.
61      */

62     public CharacterReferenceEx ()
63     {
64         super ("", 0);
65     }
66
67     /**
68      * Set the starting point of the kernel.
69      */

70     public void setStart (int start)
71     {
72         mStart = start;
73     }
74
75     /**
76      * Set the supposed ending point.
77      * This only specifies an upper bound on the kernel length.
78      */

79     public void setEnd (int end)
80     {
81         mEnd = end;
82     }
83
84     /**
85      * Get this CharacterReference's kernel.
86      * @return The kernel in the equivalent character entity reference.
87      */

88     public String JavaDoc getKernel ()
89     {
90         return (mKernel.substring (mStart, mEnd));
91     }
92
93     //
94
// Ordered interface
95
//
96

97     /**
98      * Compare one reference to another.
99      * @see org.htmlparser.util.sort.Ordered
100      */

101     public int compare (Object JavaDoc that)
102     {
103         CharacterReference r;
104         String JavaDoc kernel;
105         int length;
106         int ret;
107
108         ret = 0;
109         r = (CharacterReference)that;
110         kernel = r.getKernel ();
111         length = kernel.length ();
112         for (int i = mStart, j = 0; i < mEnd; i++, j++)
113         {
114             if (j >= length)
115             {
116                 ret = 1;
117                 break;
118             }
119             ret = mKernel.charAt (i) - kernel.charAt (j);
120             if (0 != ret)
121                 break;
122         }
123
124         return (ret);
125     }
126 }
127
128 /**
129  * Translate numeric character references and character entity references to unicode characters.
130  * Based on tables found at <a HREF="http://www.w3.org/TR/REC-html40/sgml/entities.html">
131  * http://www.w3.org/TR/REC-html40/sgml/entities.html</a>
132  * <p>Typical usage:
133  * <pre>
134  * String s = Translate.decode (getTextFromHtmlPage ());
135  * </pre>
136  * or
137  * <pre>
138  * String s = "&lt;HTML&gt;" + Translate.encode (getArbitraryText ()) + "&lt;/HTML&gt;";
139  * </pre>
140  */

141 public class Translate
142 {
143     /**
144      * If this member is set <code>true</code>, decoding of streams is
145      * done line by line in order to reduce the maximum memory required.
146      */

147     static public boolean DECODE_LINE_BY_LINE = false;
148
149     /**
150      * If this member is set <code>true</code>, encoding of numeric character
151      * references uses hexadecimal digits, i.e. &amp;#x25CB;, instead of decimal
152      * digits.
153      */

154     static public boolean ENCODE_HEXADECIMAL = false;
155
156     /**
157      * Table mapping entity reference kernel to character.
158      * This is sorted by kernel when the class is loaded.
159      */

160     protected static final CharacterReference[] mCharacterReferences =
161     {
162         // Portions © International Organization for Standardization 1986
163
// Permission to copy in any form is granted for use with
164
// conforming SGML systems and applications as defined in
165
// ISO 8879, provided this notice is included in all copies.
166
// Character entity set. Typical invocation:
167
// <!ENTITY % HTMLlat1 PUBLIC
168
// "-//W3C//ENTITIES Latin 1//EN//HTML">
169
// %HTMLlat1;
170
new CharacterReference ("nbsp", '\u00a0'), // no-break space = non-breaking space, U+00A0 ISOnum
171
new CharacterReference ("iexcl", '\u00a1'), // inverted exclamation mark, U+00A1 ISOnum
172
new CharacterReference ("cent", '\u00a2'), // cent sign, U+00A2 ISOnum
173
new CharacterReference ("pound", '\u00a3'), // pound sign, U+00A3 ISOnum
174
new CharacterReference ("curren", '\u00a4'), // currency sign, U+00A4 ISOnum
175
new CharacterReference ("yen", '\u00a5'), // yen sign = yuan sign, U+00A5 ISOnum
176
new CharacterReference ("brvbar", '\u00a6'), // broken bar = broken vertical bar, U+00A6 ISOnum
177
new CharacterReference ("sect", '\u00a7'), // section sign, U+00A7 ISOnum
178
new CharacterReference ("uml", '\u00a8'), // diaeresis = spacing diaeresis, U+00A8 ISOdia
179
new CharacterReference ("copy", '\u00a9'), // copyright sign, U+00A9 ISOnum
180
new CharacterReference ("ordf", '\u00aa'), // feminine ordinal indicator, U+00AA ISOnum
181
new CharacterReference ("laquo", '\u00ab'), // left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum
182
new CharacterReference ("not", '\u00ac'), // not sign, U+00AC ISOnum
183
new CharacterReference ("shy", '\u00ad'), // soft hyphen = discretionary hyphen, U+00AD ISOnum
184
new CharacterReference ("reg", '\u00ae'), // registered sign = registered trade mark sign, U+00AE ISOnum
185
new CharacterReference ("macr", '\u00af'), // macron = spacing macron = overline = APL overbar, U+00AF ISOdia
186
new CharacterReference ("deg", '\u00b0'), // degree sign, U+00B0 ISOnum
187
new CharacterReference ("plusmn", '\u00b1'), // plus-minus sign = plus-or-minus sign, U+00B1 ISOnum
188
new CharacterReference ("sup2", '\u00b2'), // superscript two = superscript digit two = squared, U+00B2 ISOnum
189
new CharacterReference ("sup3", '\u00b3'), // superscript three = superscript digit three = cubed, U+00B3 ISOnum
190
new CharacterReference ("acute", '\u00b4'), // acute accent = spacing acute, U+00B4 ISOdia
191
new CharacterReference ("micro", '\u00b5'), // micro sign, U+00B5 ISOnum
192
new CharacterReference ("para", '\u00b6'), // pilcrow sign = paragraph sign, U+00B6 ISOnum
193
new CharacterReference ("middot", '\u00b7'), // middle dot = Georgian comma = Greek middle dot, U+00B7 ISOnum
194
new CharacterReference ("cedil", '\u00b8'), // cedilla = spacing cedilla, U+00B8 ISOdia
195
new CharacterReference ("sup1", '\u00b9'), // superscript one = superscript digit one, U+00B9 ISOnum
196
new CharacterReference ("ordm", '\u00ba'), // masculine ordinal indicator, U+00BA ISOnum
197
new CharacterReference ("raquo", '\u00bb'), // right-pointing double angle quotation mark = right pointing guillemet, U+00BB ISOnum
198
new CharacterReference ("frac14", '\u00bc'), // vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum
199
new CharacterReference ("frac12", '\u00bd'), // vulgar fraction one half = fraction one half, U+00BD ISOnum
200
new CharacterReference ("frac34", '\u00be'), // vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum
201
new CharacterReference ("iquest", '\u00bf'), // inverted question mark = turned question mark, U+00BF ISOnum
202
new CharacterReference ("Agrave", '\u00c0'), // latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1
203
new CharacterReference ("Aacute", '\u00c1'), // latin capital letter A with acute, U+00C1 ISOlat1
204
new CharacterReference ("Acirc", '\u00c2'), // latin capital letter A with circumflex, U+00C2 ISOlat1
205
new CharacterReference ("Atilde", '\u00c3'), // latin capital letter A with tilde, U+00C3 ISOlat1
206
new CharacterReference ("Auml", '\u00c4'), // latin capital letter A with diaeresis, U+00C4 ISOlat1
207
new CharacterReference ("Aring", '\u00c5'), // latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1
208
new CharacterReference ("AElig", '\u00c6'), // latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1
209
new CharacterReference ("Ccedil", '\u00c7'), // latin capital letter C with cedilla, U+00C7 ISOlat1
210
new CharacterReference ("Egrave", '\u00c8'), // latin capital letter E with grave, U+00C8 ISOlat1
211
new CharacterReference ("Eacute", '\u00c9'), // latin capital letter E with acute, U+00C9 ISOlat1
212
new CharacterReference ("Ecirc", '\u00ca'), // latin capital letter E with circumflex, U+00CA ISOlat1
213
new CharacterReference ("Euml", '\u00cb'), // latin capital letter E with diaeresis, U+00CB ISOlat1
214
new CharacterReference ("Igrave", '\u00cc'), // latin capital letter I with grave, U+00CC ISOlat1
215
new CharacterReference ("Iacute", '\u00cd'), // latin capital letter I with acute, U+00CD ISOlat1
216
new CharacterReference ("Icirc", '\u00ce'), // latin capital letter I with circumflex, U+00CE ISOlat1
217
new CharacterReference ("Iuml", '\u00cf'), // latin capital letter I with diaeresis, U+00CF ISOlat1
218
new CharacterReference ("ETH", '\u00d0'), // latin capital letter ETH, U+00D0 ISOlat1
219
new CharacterReference ("Ntilde", '\u00d1'), // latin capital letter N with tilde, U+00D1 ISOlat1
220
new CharacterReference ("Ograve", '\u00d2'), // latin capital letter O with grave, U+00D2 ISOlat1
221
new CharacterReference ("Oacute", '\u00d3'), // latin capital letter O with acute, U+00D3 ISOlat1
222
new CharacterReference ("Ocirc", '\u00d4'), // latin capital letter O with circumflex, U+00D4 ISOlat1
223
new CharacterReference ("Otilde", '\u00d5'), // latin capital letter O with tilde, U+00D5 ISOlat1
224
new CharacterReference ("Ouml", '\u00d6'), // latin capital letter O with diaeresis, U+00D6 ISOlat1
225
new CharacterReference ("times", '\u00d7'), // multiplication sign, U+00D7 ISOnum
226
new CharacterReference ("Oslash", '\u00d8'), // latin capital letter O with stroke = latin capital letter O slash, U+00D8 ISOlat1
227
new CharacterReference ("Ugrave", '\u00d9'), // latin capital letter U with grave, U+00D9 ISOlat1
228
new CharacterReference ("Uacute", '\u00da'), // latin capital letter U with acute, U+00DA ISOlat1
229
new CharacterReference ("Ucirc", '\u00db'), // latin capital letter U with circumflex, U+00DB ISOlat1
230
new CharacterReference ("Uuml", '\u00dc'), // latin capital letter U with diaeresis, U+00DC ISOlat1
231
new CharacterReference ("Yacute", '\u00dd'), // latin capital letter Y with acute, U+00DD ISOlat1
232
new CharacterReference ("THORN", '\u00de'), // latin capital letter THORN, U+00DE ISOlat1
233
new CharacterReference ("szlig", '\u00df'), // latin small letter sharp s = ess-zed, U+00DF ISOlat1
234
new CharacterReference ("agrave", '\u00e0'), // latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1
235
new CharacterReference ("aacute", '\u00e1'), // latin small letter a with acute, U+00E1 ISOlat1
236
new CharacterReference ("acirc", '\u00e2'), // latin small letter a with circumflex, U+00E2 ISOlat1
237
new CharacterReference ("atilde", '\u00e3'), // latin small letter a with tilde, U+00E3 ISOlat1
238
new CharacterReference ("auml", '\u00e4'), // latin small letter a with diaeresis, U+00E4 ISOlat1
239
new CharacterReference ("aring", '\u00e5'), // latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1
240
new CharacterReference ("aelig", '\u00e6'), // latin small letter ae = latin small ligature ae, U+00E6 ISOlat1
241
new CharacterReference ("ccedil", '\u00e7'), // latin small letter c with cedilla, U+00E7 ISOlat1
242
new CharacterReference ("egrave", '\u00e8'), // latin small letter e with grave, U+00E8 ISOlat1
243
new CharacterReference ("eacute", '\u00e9'), // latin small letter e with acute, U+00E9 ISOlat1
244
new CharacterReference ("ecirc", '\u00ea'), // latin small letter e with circumflex, U+00EA ISOlat1
245
new CharacterReference ("euml", '\u00eb'), // latin small letter e with diaeresis, U+00EB ISOlat1
246
new CharacterReference ("igrave", '\u00ec'), // latin small letter i with grave, U+00EC ISOlat1
247
new CharacterReference ("iacute", '\u00ed'), // latin small letter i with acute, U+00ED ISOlat1
248
new CharacterReference ("icirc", '\u00ee'), // latin small letter i with circumflex, U+00EE ISOlat1
249
new CharacterReference ("iuml", '\u00ef'), // latin small letter i with diaeresis, U+00EF ISOlat1
250
new CharacterReference ("eth", '\u00f0'), // latin small letter eth, U+00F0 ISOlat1
251
new CharacterReference ("ntilde", '\u00f1'), // latin small letter n with tilde, U+00F1 ISOlat1
252
new CharacterReference ("ograve", '\u00f2'), // latin small letter o with grave, U+00F2 ISOlat1
253
new CharacterReference ("oacute", '\u00f3'), // latin small letter o with acute, U+00F3 ISOlat1
254
new CharacterReference ("ocirc", '\u00f4'), // latin small letter o with circumflex, U+00F4 ISOlat1
255
new CharacterReference ("otilde", '\u00f5'), // latin small letter o with tilde, U+00F5 ISOlat1
256
new CharacterReference ("ouml", '\u00f6'), // latin small letter o with diaeresis, U+00F6 ISOlat1
257
new CharacterReference ("divide", '\u00f7'), // division sign, U+00F7 ISOnum
258
new CharacterReference ("oslash", '\u00f8'), // latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1
259
new CharacterReference ("ugrave", '\u00f9'), // latin small letter u with grave, U+00F9 ISOlat1
260
new CharacterReference ("uacute", '\u00fa'), // latin small letter u with acute, U+00FA ISOlat1
261
new CharacterReference ("ucirc", '\u00fb'), // latin small letter u with circumflex, U+00FB ISOlat1
262
new CharacterReference ("uuml", '\u00fc'), // latin small letter u with diaeresis, U+00FC ISOlat1
263
new CharacterReference ("yacute", '\u00fd'), // latin small letter y with acute, U+00FD ISOlat1
264
new CharacterReference ("thorn", '\u00fe'), // latin small letter thorn, U+00FE ISOlat1
265
new CharacterReference ("yuml", '\u00ff'), // latin small letter y with diaeresis, U+00FF ISOlat1
266
// Mathematical, Greek and Symbolic characters for HTML
267
// Character entity set. Typical invocation:
268
// <!ENTITY % HTMLsymbol PUBLIC
269
// "-//W3C//ENTITIES Symbols//EN//HTML">
270
// %HTMLsymbol;
271
// Portions © International Organization for Standardization 1986:
272
// Permission to copy in any form is granted for use with
273
// conforming SGML systems and applications as defined in
274
// ISO 8879, provided this notice is included in all copies.
275
// Relevant ISO entity set is given unless names are newly introduced.
276
// New names (i.e., not in ISO 8879 list) do not clash with any
277
// existing ISO 8879 entity names. ISO 10646 character numbers
278
// are given for each character, in hex. CDATA values are decimal
279
// conversions of the ISO 10646 values and refer to the document
280
// character set. Names are ISO 10646 names.
281
// Latin Extended-B
282
new CharacterReference ("fnof", '\u0192'), // latin small f with hook = function = florin, U+0192 ISOtech
283
// Greek
284
new CharacterReference ("Alpha", '\u0391'), // greek capital letter alpha, U+0391
285
new CharacterReference ("Beta", '\u0392'), // greek capital letter beta, U+0392
286
new CharacterReference ("Gamma", '\u0393'), // greek capital letter gamma, U+0393 ISOgrk3
287
new CharacterReference ("Delta", '\u0394'), // greek capital letter delta, U+0394 ISOgrk3
288
new CharacterReference ("Epsilon", '\u0395'), // greek capital letter epsilon, U+0395
289
new CharacterReference ("Zeta", '\u0396'), // greek capital letter zeta, U+0396
290
new CharacterReference ("Eta", '\u0397'), // greek capital letter eta, U+0397
291
new CharacterReference ("Theta", '\u0398'), // greek capital letter theta, U+0398 ISOgrk3
292
new CharacterReference ("Iota", '\u0399'), // greek capital letter iota, U+0399
293
new CharacterReference ("Kappa", '\u039a'), // greek capital letter kappa, U+039A
294
new CharacterReference ("Lambda", '\u039b'), // greek capital letter lambda, U+039B ISOgrk3
295
new CharacterReference ("Mu", '\u039c'), // greek capital letter mu, U+039C
296
new CharacterReference ("Nu", '\u039d'), // greek capital letter nu, U+039D
297
new CharacterReference ("Xi", '\u039e'), // greek capital letter xi, U+039E ISOgrk3
298
new CharacterReference ("Omicron", '\u039f'), // greek capital letter omicron, U+039F
299
new CharacterReference ("Pi", '\u03a0'), // greek capital letter pi, U+03A0 ISOgrk3
300
new CharacterReference ("Rho", '\u03a1'), // greek capital letter rho, U+03A1
301
// there is no Sigmaf, and no U+03A2 character either
302
new CharacterReference ("Sigma", '\u03a3'), // greek capital letter sigma, U+03A3 ISOgrk3
303
new CharacterReference ("Tau", '\u03a4'), // greek capital letter tau, U+03A4
304
new CharacterReference ("Upsilon", '\u03a5'), // greek capital letter upsilon, U+03A5 ISOgrk3
305
new CharacterReference ("Phi", '\u03a6'), // greek capital letter phi, U+03A6 ISOgrk3
306
new CharacterReference ("Chi", '\u03a7'), // greek capital letter chi, U+03A7
307
new CharacterReference ("Psi", '\u03a8'), // greek capital letter psi, U+03A8 ISOgrk3
308
new CharacterReference ("Omega", '\u03a9'), // greek capital letter omega, U+03A9 ISOgrk3
309
new CharacterReference ("alpha", '\u03b1'), // greek small letter alpha, U+03B1 ISOgrk3
310
new CharacterReference ("beta", '\u03b2'), // greek small letter beta, U+03B2 ISOgrk3
311
new CharacterReference ("gamma", '\u03b3'), // greek small letter gamma, U+03B3 ISOgrk3
312
new CharacterReference ("delta", '\u03b4'), // greek small letter delta, U+03B4 ISOgrk3
313
new CharacterReference ("epsilon", '\u03b5'), // greek small letter epsilon, U+03B5 ISOgrk3
314
new CharacterReference ("zeta", '\u03b6'), // greek small letter zeta, U+03B6 ISOgrk3
315
new CharacterReference ("eta", '\u03b7'), // greek small letter eta, U+03B7 ISOgrk3
316
new CharacterReference ("theta", '\u03b8'), // greek small letter theta, U+03B8 ISOgrk3
317
new CharacterReference ("iota", '\u03b9'), // greek small letter iota, U+03B9 ISOgrk3
318
new CharacterReference ("kappa", '\u03ba'), // greek small letter kappa, U+03BA ISOgrk3
319
new CharacterReference ("lambda", '\u03bb'), // greek small letter lambda, U+03BB ISOgrk3
320
new CharacterReference ("mu", '\u03bc'), // greek small letter mu, U+03BC ISOgrk3
321
new CharacterReference ("nu", '\u03bd'), // greek small letter nu, U+03BD ISOgrk3
322
new CharacterReference ("xi", '\u03be'), // greek small letter xi, U+03BE ISOgrk3
323
new CharacterReference ("omicron", '\u03bf'), // greek small letter omicron, U+03BF NEW
324
new CharacterReference ("pi", '\u03c0'), // greek small letter pi, U+03C0 ISOgrk3
325
new CharacterReference ("rho", '\u03c1'), // greek small letter rho, U+03C1 ISOgrk3
326
new CharacterReference ("sigmaf", '\u03c2'), // greek small letter final sigma, U+03C2 ISOgrk3
327
new CharacterReference ("sigma", '\u03c3'), // greek small letter sigma, U+03C3 ISOgrk3
328
new CharacterReference ("tau", '\u03c4'), // greek small letter tau, U+03C4 ISOgrk3
329
new CharacterReference ("upsilon", '\u03c5'), // greek small letter upsilon, U+03C5 ISOgrk3
330
new CharacterReference ("phi", '\u03c6'), // greek small letter phi, U+03C6 ISOgrk3
331
new CharacterReference ("chi", '\u03c7'), // greek small letter chi, U+03C7 ISOgrk3
332
new CharacterReference ("psi", '\u03c8'), // greek small letter psi, U+03C8 ISOgrk3
333
new CharacterReference ("omega", '\u03c9'), // greek small letter omega, U+03C9 ISOgrk3
334
new CharacterReference ("thetasym", '\u03d1'), // greek small letter theta symbol, U+03D1 NEW
335
new CharacterReference ("upsih", '\u03d2'), // greek upsilon with hook symbol, U+03D2 NEW
336
new CharacterReference ("piv", '\u03d6'), // greek pi symbol, U+03D6 ISOgrk3
337
// General Punctuation
338
new CharacterReference ("bull", '\u2022'), // bullet = black small circle, U+2022 ISOpub
339
// bullet is NOT the same as bullet operator, U+2219
340
new CharacterReference ("hellip", '\u2026'), // horizontal ellipsis = three dot leader, U+2026 ISOpub
341
new CharacterReference ("prime", '\u2032'), // prime = minutes = feet, U+2032 ISOtech
342
new CharacterReference ("Prime", '\u2033'), // double prime = seconds = inches, U+2033 ISOtech
343
new CharacterReference ("oline", '\u203e'), // overline = spacing overscore, U+203E NEW
344
new CharacterReference ("frasl", '\u2044'), // fraction slash, U+2044 NEW
345
// Letterlike Symbols
346
new CharacterReference ("weierp", '\u2118'), // script capital P = power set = Weierstrass p, U+2118 ISOamso
347
new CharacterReference ("image", '\u2111'), // blackletter capital I = imaginary part, U+2111 ISOamso
348
new CharacterReference ("real", '\u211c'), // blackletter capital R = real part symbol, U+211C ISOamso
349
new CharacterReference ("trade", '\u2122'), // trade mark sign, U+2122 ISOnum
350
new CharacterReference ("alefsym", '\u2135'), // alef symbol = first transfinite cardinal, U+2135 NEW
351
// alef symbol is NOT the same as hebrew letter alef,
352
// U+05D0 although the same glyph could be used to depict both characters
353
// Arrows
354
new CharacterReference ("larr", '\u2190'), // leftwards arrow, U+2190 ISOnum
355
new CharacterReference ("uarr", '\u2191'), // upwards arrow, U+2191 ISOnum
356
new CharacterReference ("rarr", '\u2192'), // rightwards arrow, U+2192 ISOnum
357
new CharacterReference ("darr", '\u2193'), // downwards arrow, U+2193 ISOnum
358
new CharacterReference ("harr", '\u2194'), // left right arrow, U+2194 ISOamsa
359
new CharacterReference ("crarr", '\u21b5'), // downwards arrow with corner leftwards = carriage return, U+21B5 NEW
360
new CharacterReference ("lArr", '\u21d0'), // leftwards double arrow, U+21D0 ISOtech
361
// ISO 10646 does not say that lArr is the same as the 'is implied by' arrow
362
// but also does not have any other character for that function. So ? lArr can
363
// be used for 'is implied by' as ISOtech suggests
364
new CharacterReference ("uArr", '\u21d1'), // upwards double arrow, U+21D1 ISOamsa
365
new CharacterReference ("rArr", '\u21d2'), // rightwards double arrow, U+21D2 ISOtech
366
// ISO 10646 does not say this is the 'implies' character but does not have
367
// another character with this function so ?
368
// rArr can be used for 'implies' as ISOtech suggests
369
new CharacterReference ("dArr", '\u21d3'), // downwards double arrow, U+21D3 ISOamsa
370
new CharacterReference ("hArr", '\u21d4'), // left right double arrow, U+21D4 ISOamsa
371
// Mathematical Operators
372
new CharacterReference ("forall", '\u2200'), // for all, U+2200 ISOtech
373
new CharacterReference ("part", '\u2202'), // partial differential, U+2202 ISOtech
374
new CharacterReference ("exist", '\u2203'), // there exists, U+2203 ISOtech
375
new CharacterReference ("empty", '\u2205'), // empty set = null set = diameter, U+2205 ISOamso
376
new CharacterReference ("nabla", '\u2207'), // nabla = backward difference, U+2207 ISOtech
377
new CharacterReference ("isin", '\u2208'), // element of, U+2208 ISOtech
378
new CharacterReference ("notin", '\u2209'), // not an element of, U+2209 ISOtech
379
new CharacterReference ("ni", '\u220b'), // contains as member, U+220B ISOtech
380
// should there be a more memorable name than 'ni'?
381
new CharacterReference ("prod", '\u220f'), // n-ary product = product sign, U+220F ISOamsb
382
// prod is NOT the same character as U+03A0 'greek capital letter pi' though
383
// the same glyph might be used for both
384
new CharacterReference ("sum", '\u2211'), // n-ary sumation, U+2211 ISOamsb
385
// sum is NOT the same character as U+03A3 'greek capital letter sigma'
386
// though the same glyph might be used for both
387
new CharacterReference ("minus", '\u2212'), // minus sign, U+2212 ISOtech
388
new CharacterReference ("lowast", '\u2217'), // asterisk operator, U+2217 ISOtech
389
new CharacterReference ("radic", '\u221a'), // square root = radical sign, U+221A ISOtech
390
new CharacterReference ("prop", '\u221d'), // proportional to, U+221D ISOtech
391
new CharacterReference ("infin", '\u221e'), // infinity, U+221E ISOtech
392
new CharacterReference ("ang", '\u2220'), // angle, U+2220 ISOamso
393
new CharacterReference ("and", '\u2227'), // logical and = wedge, U+2227 ISOtech
394
new CharacterReference ("or", '\u2228'), // logical or = vee, U+2228 ISOtech
395
new CharacterReference ("cap", '\u2229'), // intersection = cap, U+2229 ISOtech
396
new CharacterReference ("cup", '\u222a'), // union = cup, U+222A ISOtech
397
new CharacterReference ("int", '\u222b'), // integral, U+222B ISOtech
398
new CharacterReference ("there4", '\u2234'), // therefore, U+2234 ISOtech
399
new CharacterReference ("sim", '\u223c'), // tilde operator = varies with = similar to, U+223C ISOtech
400
// tilde operator is NOT the same character as the tilde, U+007E,
401
// although the same glyph might be used to represent both
402
new CharacterReference ("cong", '\u2245'), // approximately equal to, U+2245 ISOtech
403
new CharacterReference ("asymp", '\u2248'), // almost equal to = asymptotic to, U+2248 ISOamsr
404
new CharacterReference ("ne", '\u2260'), // not equal to, U+2260 ISOtech
405
new CharacterReference ("equiv", '\u2261'), // identical to, U+2261 ISOtech
406
new CharacterReference ("le", '\u2264'), // less-than or equal to, U+2264 ISOtech
407
new CharacterReference ("ge", '\u2265'), // greater-than or equal to, U+2265 ISOtech
408
new CharacterReference ("sub", '\u2282'), // subset of, U+2282 ISOtech
409
new CharacterReference ("sup", '\u2283'), // superset of, U+2283 ISOtech
410
// note that nsup, 'not a superset of, U+2283' is not covered by the Symbol
411
// font encoding and is not included. Should it be, for symmetry?
412
// It is in ISOamsn
413
new CharacterReference ("nsub", '\u2284'), // not a subset of, U+2284 ISOamsn
414
new CharacterReference ("sube", '\u2286'), // subset of or equal to, U+2286 ISOtech
415
new CharacterReference ("supe", '\u2287'), // superset of or equal to, U+2287 ISOtech
416
new CharacterReference ("oplus", '\u2295'), // circled plus = direct sum, U+2295 ISOamsb
417
new CharacterReference ("otimes", '\u2297'), // circled times = vector product, U+2297 ISOamsb
418
new CharacterReference ("perp", '\u22a5'), // up tack = orthogonal to = perpendicular, U+22A5 ISOtech
419
new CharacterReference ("sdot", '\u22c5'), // dot operator, U+22C5 ISOamsb
420
// dot operator is NOT the same character as U+00B7 middle dot
421
// Miscellaneous Technical
422
new CharacterReference ("lceil", '\u2308'), // left ceiling = apl upstile, U+2308 ISOamsc
423
new CharacterReference ("rceil", '\u2309'), // right ceiling, U+2309 ISOamsc
424
new CharacterReference ("lfloor", '\u230a'), // left floor = apl downstile, U+230A ISOamsc
425
new CharacterReference ("rfloor", '\u230b'), // right floor, U+230B ISOamsc
426
new CharacterReference ("lang", '\u2329'), // left-pointing angle bracket = bra, U+2329 ISOtech
427
// lang is NOT the same character as U+003C 'less than'
428
// or U+2039 'single left-pointing angle quotation mark'
429
new CharacterReference ("rang", '\u232a'), // right-pointing angle bracket = ket, U+232A ISOtech
430
// rang is NOT the same character as U+003E 'greater than'
431
// or U+203A 'single right-pointing angle quotation mark'
432
// Geometric Shapes
433
new CharacterReference ("loz", '\u25ca'), // lozenge, U+25CA ISOpub
434
// Miscellaneous Symbols
435
new CharacterReference ("spades", '\u2660'), // black spade suit, U+2660 ISOpub
436
// black here seems to mean filled as opposed to hollow
437
new CharacterReference ("clubs", '\u2663'), // black club suit = shamrock, U+2663 ISOpub
438
new CharacterReference ("hearts", '\u2665'), // black heart suit = valentine, U+2665 ISOpub
439
new CharacterReference ("diams", '\u2666'), // black diamond suit, U+2666 ISOpub
440
// Special characters for HTML
441
// Character entity set. Typical invocation:
442
// <!ENTITY % HTMLspecial PUBLIC
443
// "-//W3C//ENTITIES Special//EN//HTML">
444
// %HTMLspecial;
445
// Portions © International Organization for Standardization 1986:
446
// Permission to copy in any form is granted for use with
447
// conforming SGML systems and applications as defined in
448
// ISO 8879, provided this notice is included in all copies.
449
// Relevant ISO entity set is given unless names are newly introduced.
450
// New names (i.e., not in ISO 8879 list) do not clash with any
451
// existing ISO 8879 entity names. ISO 10646 character numbers
452
// are given for each character, in hex. CDATA values are decimal
453
// conversions of the ISO 10646 values and refer to the document
454
// character set. Names are ISO 10646 names.
455
// C0 Controls and Basic Latin
456
new CharacterReference ("quot", '\u0022'), // quotation mark = APL quote, U+0022 ISOnum
457
new CharacterReference ("amp", '\u0026'), // ampersand, U+0026 ISOnum
458
new CharacterReference ("lt", '\u003c'), // less-than sign, U+003C ISOnum
459
new CharacterReference ("gt", '\u003e'), // greater-than sign, U+003E ISOnum
460
// Latin Extended-A
461
new CharacterReference ("OElig", '\u0152'), // latin capital ligature OE, U+0152 ISOlat2
462
new CharacterReference ("oelig", '\u0153'), // latin small ligature oe, U+0153 ISOlat2
463
// ligature is a misnomer, this is a separate character in some languages
464
new CharacterReference ("Scaron", '\u0160'), // latin capital letter S with caron, U+0160 ISOlat2
465
new CharacterReference ("scaron", '\u0161'), // latin small letter s with caron, U+0161 ISOlat2
466
new CharacterReference ("Yuml", '\u0178'), // latin capital letter Y with diaeresis, U+0178 ISOlat2
467
// Spacing Modifier Letters
468
new CharacterReference ("circ", '\u02c6'), // modifier letter circumflex accent, U+02C6 ISOpub
469
new CharacterReference ("tilde", '\u02dc'), // small tilde, U+02DC ISOdia
470
// General Punctuation
471
new CharacterReference ("ensp", '\u2002'), // en space, U+2002 ISOpub
472
new CharacterReference ("emsp", '\u2003'), // em space, U+2003 ISOpub
473
new CharacterReference ("thinsp", '\u2009'), // thin space, U+2009 ISOpub
474
new CharacterReference ("zwnj", '\u200c'), // zero width non-joiner, U+200C NEW RFC 2070
475
new CharacterReference ("zwj", '\u200d'), // zero width joiner, U+200D NEW RFC 2070
476
new CharacterReference ("lrm", '\u200e'), // left-to-right mark, U+200E NEW RFC 2070
477
new CharacterReference ("rlm", '\u200f'), // right-to-left mark, U+200F NEW RFC 2070
478
new CharacterReference ("ndash", '\u2013'), // en dash, U+2013 ISOpub
479
new CharacterReference ("mdash", '\u2014'), // em dash, U+2014 ISOpub
480
new CharacterReference ("lsquo", '\u2018'), // left single quotation mark, U+2018 ISOnum
481
new CharacterReference ("rsquo", '\u2019'), // right single quotation mark, U+2019 ISOnum
482
new CharacterReference ("sbquo", '\u201a'), // single low-9 quotation mark, U+201A NEW
483
new CharacterReference ("ldquo", '\u201c'), // left double quotation mark, U+201C ISOnum
484
new CharacterReference ("rdquo", '\u201d'), // right double quotation mark, U+201D ISOnum
485
new CharacterReference ("bdquo", '\u201e'), // double low-9 quotation mark, U+201E NEW
486
new CharacterReference ("dagger", '\u2020'), // dagger, U+2020 ISOpub
487
new CharacterReference ("Dagger", '\u2021'), // double dagger, U+2021 ISOpub
488
new CharacterReference ("permil", '\u2030'), // per mille sign, U+2030 ISOtech
489
new CharacterReference ("lsaquo", '\u2039'), // single left-pointing angle quotation mark, U+2039 ISO proposed
490
// lsaquo is proposed but not yet ISO standardized
491
new CharacterReference ("rsaquo", '\u203a'), // single right-pointing angle quotation mark, U+203A ISO proposed
492
// rsaquo is proposed but not yet ISO standardized
493
new CharacterReference ("euro", '\u20ac'), // euro sign, U+20AC NEW
494
};
495
496     /**
497      * The dividing point between a simple table lookup and a binary search.
498      * Characters below the break point are stored in a sparse array allowing
499      * direct index lookup.
500      */

501     protected static final int BREAKPOINT = 0x100;
502
503     /**
504      * List of references sorted by character.
505      * The first part of this array, up to <code>BREAKPOINT</code> is stored
506      * in a direct translational table, indexing into the table with a character
507      * yields the reference. The second part is dense and sorted by character,
508      * suitable for binary lookup.
509      */

510     protected static final CharacterReference[] mCharacterList;
511
512     static
513     {
514         int index;
515         CharacterReference item;
516         int character;
517
518         // count below the break point
519
index = 0;
520         for (int i = 0; i < mCharacterReferences.length; i++)
521             if (mCharacterReferences[i].getCharacter () < BREAKPOINT)
522                 index++;
523         // allocate enough for the linear table and remainder
524
mCharacterList = new CharacterReference[BREAKPOINT + mCharacterReferences.length - index];
525         index = BREAKPOINT;
526         for (int i = 0; i < mCharacterReferences.length; i++)
527         {
528             item = mCharacterReferences[i];
529             character = mCharacterReferences[i].getCharacter ();
530             if (character < BREAKPOINT)
531                 mCharacterList[character] = item;
532             else
533             {
534                 // use a linear search and insertion sort, done only once
535
int x = BREAKPOINT;
536                 while (x < index)
537                     if (mCharacterList[x].getCharacter () > character)
538                         break;
539                     else
540                         x++;
541                 int y = index - 1;
542                 while (y >= x)
543                 {
544                     mCharacterList[y + 1] = mCharacterList[y];
545                     y--;
546                 }
547                 mCharacterList[x] = item;
548                 index++;
549             }
550         }
551         // reorder the original array into kernel order
552
Sort.QuickSort (mCharacterReferences);
553     }
554
555     /**
556      * Private constructor.
557      * This class is fully static and thread safe.
558      */

559     private Translate ()
560     {
561     }
562
563     /**
564      * Binary search for a reference.
565      * @param array The array of <code>CharacterReference</code> objects.
566      * @param ref The character to search for.
567      * @param lo The lower index within which to look.
568      * @param hi The upper index within which to look.
569      * @return The index at which reference was found or is to be inserted.
570      */

571     protected static int lookup (CharacterReference[] array, char ref, int lo, int hi)
572     { int num;
573         int mid;
574         int half;
575         int result;
576         int ret;
577
578         ret = -1;
579
580         num = (hi - lo) + 1;
581         while ((-1 == ret) && (lo <= hi))
582         {
583             half = num / 2;
584             mid = lo + ((0 != (num & 1)) ? half : half - 1);
585             result = ref - array[mid].getCharacter ();
586             if (0 == result)
587                 ret = mid;
588             else if (0 > result)
589             {
590                 hi = mid - 1;
591                 num = ((0 != (num & 1)) ? half : half - 1);
592             }
593             else
594             {
595                 lo = mid + 1;
596                 num = half;
597             }
598         }
599         if (-1 == ret)
600             ret = lo;
601
602         return (ret);
603     }
604
605     /**
606      * Look up a reference by character.
607      * Use a combination of direct table lookup and binary search to find
608      * the reference corresponding to the character.
609      * @param character The character to be looked up.
610      * @return The entity reference for that character or <code>null</code>.
611      */

612     public static CharacterReference lookup (char character)
613     {
614         int index;
615         CharacterReference ret;
616
617         if (character < BREAKPOINT)
618             ret = mCharacterList[character];
619         else
620         {
621             index = lookup (mCharacterList, character, BREAKPOINT, mCharacterList.length - 1);
622             if (index < mCharacterList.length)
623             {
624                 ret = mCharacterList[index];
625                 if (character != ret.getCharacter ())
626                     ret = null;
627             }
628             else
629                 ret = null;
630         }
631         
632         return (ret);
633     }
634
635     /**
636      * Look up a reference by kernel.
637      * Use a binary search on the ordered list of known references.
638      * Since the binary search returns the position at which a new item should
639      * be inserted, we check the references earlier in the list if there is
640      * a failure.
641      * @param key A character reference with the kernel set to the string
642      * to be found. It need not be truncated at the exact end of the reference.
643      */

644     protected static CharacterReference lookup (CharacterReference key)
645     {
646         String JavaDoc string;
647         int index;
648         String JavaDoc kernel;
649         char character;
650         CharacterReference test;
651         CharacterReference ret;
652
653         // Care should be taken here because some entity references are
654
// prefixes of others, i.e.:
655
// \u2209[notin] \u00ac[not]
656
// \u00ba[ordm] \u2228[or]
657
// \u03d6[piv] \u03c0[pi]
658
// \u00b3[sup3] \u2283[sup]
659
ret = null;
660         index = Sort.bsearch (mCharacterReferences, key);
661         string = key.getKernel ();
662         if (index < mCharacterReferences.length)
663         {
664             ret = mCharacterReferences[index];
665             kernel = ret.getKernel ();
666             if (!string.regionMatches (
667                 0,
668                 kernel,
669                 0,
670                 kernel.length ()))
671             { // not exact, check references starting with same character
672
// to see if a subset matches
673
ret = null;
674             }
675         }
676         if (null == ret)
677         {
678             character = string.charAt (0);
679             while (--index >= 0)
680             {
681                 test = mCharacterReferences[index];
682                 kernel = test.getKernel ();
683                 if (character == kernel.charAt (0))
684                 {
685                     if (string.regionMatches (
686                         0,
687                         kernel,
688                         0,
689                         kernel.length ()))
690                     {
691                         ret = test;
692                         break;
693                     }
694                 }
695                 else
696                     break;
697             }
698         }
699         
700         return (ret);
701     }
702
703     /**
704      * Look up a reference by kernel.
705      * Use a binary search on the ordered list of known references.
706      * <em>This is not very efficient, use {@link org.htmlparser.util.Translate#lookup(org.htmlparser.util.CharacterReference) lookup(CharacterReference)}
707      * instead.</em>
708      * @param kernel The string to lookup, i.e. "amp".
709      * @param start The starting point in the string of the kernel.
710      * @param end The ending point in the string of the kernel.
711      * This should be the index of the semicolon if it exists, or failing that,
712      * at least an index past the last character of the kernel.
713      * @return The reference that matches the given string, or <code>null</code>
714      * if it wasn't found.
715      */

716     public static CharacterReference lookup (String JavaDoc kernel, int start, int end)
717     {
718         CharacterReferenceEx probe;
719         
720         probe = new CharacterReferenceEx ();
721         probe.setKernel (kernel);
722         probe.setStart (start);
723         probe.setEnd (end);
724
725         return (lookup (probe));
726     }
727
728     /**
729      * Convert a reference to a unicode character.
730      * Convert a single numeric character reference or character entity reference
731      * to a unicode character.
732      * @param string The string to convert. Of the form &xxxx; or &amp;#xxxx; with
733      * or without the leading ampersand or trailing semi-colon.
734      * @param start The starting pooint in the string to look for a character reference.
735      * @param end The ending point in the string to stop looking for a character reference.
736      * @return The converted character or '' (zero) if the string is an
737      * invalid reference.
738      * @deprecated Use {@link #decode(String) decode}.
739      */

740     public static char convertToChar (String JavaDoc string, int start, int end)
741     {
742         return (decode (string.substring (start, end)).charAt (0));
743     }
744
745     /**
746      * Convert a reference to a unicode character.
747      * Convert a single numeric character reference or character entity reference
748      * to a unicode character.
749      * @param string The string to convert. Of the form &xxxx; or &amp;#xxxx; with
750      * or without the leading ampersand or trailing semi-colon.
751      * @return The converted character or '' (zero) if the string is an
752      * invalid reference.
753      * @deprecated Use {@link #decode(String) decode}.
754      */

755     public static char convertToChar (String JavaDoc string)
756     {
757         return (decode (string).charAt (0));
758     }
759
760     /**
761      * Decode a string containing references.
762      * Change all numeric character reference and character entity references
763      * to unicode characters.
764      * @param string The string to translate.
765      */

766     public static String JavaDoc decode (String JavaDoc string)
767     {
768         CharacterReferenceEx key;
769         int amp;
770         int index;
771         int length;
772         StringBuffer JavaDoc buffer;
773         char character;
774         int number;
775         int radix;
776         int i;
777         int semi;
778         boolean done;
779         CharacterReference item;
780         String JavaDoc ret;
781
782         if (-1 == (amp = string.indexOf ('&')))
783             ret = string;
784         else
785         {
786             key = null;
787             index = 0;
788             length = string.length ();
789             buffer = new StringBuffer JavaDoc (length);
790             do
791             {
792                 // equivalent to buffer.append (string.substring (index, amp));
793
// but without the allocation of a new String
794
while (index < amp)
795                     buffer.append (string.charAt (index++));
796                 
797                 index++;
798                 if (index < length)
799                 {
800                     character = string.charAt (index);
801                     if ('#' == character)
802                     {
803                         // numeric character reference
804
index++;
805                         number = 0;
806                         radix = 0;
807                         i = index;
808                         done = false;
809                         while ((i < length) && !done)
810                         {
811                             character = string.charAt (i);
812                             switch (character)
813                             {
814                                 case '0':
815                                 case '1':
816                                 case '2':
817                                 case '3':
818                                 case '4':
819                                 case '5':
820                                 case '6':
821                                 case '7':
822                                 case '8':
823                                 case '9':
824                                     if (0 == radix)
825                                         radix = 10;
826                                     number = number * radix + (character - '0');
827                                     break;
828                                 case 'A':
829                                 case 'B':
830                                 case 'C':
831                                 case 'D':
832                                 case 'E':
833                                 case 'F':
834                                     if (16 == radix)
835                                         number = number * radix + (character - 'A' + 10);
836                                     else
837                                         done = true;
838                                     break;
839                                 case 'a':
840                                 case 'b':
841                                 case 'c':
842                                 case 'd':
843                                 case 'e':
844                                 case 'f':
845                                     if (16 == radix)
846                                         number = number * radix + (character - 'a' + 10);
847                                     else
848                                         done = true;
849                                     break;
850                                 case 'x':
851                                 case 'X':
852                                     if (0 == radix)
853                                         radix = 16;
854                                     else
855                                         done = true;
856                                     break;
857                                 case ';':
858                                     done = true;
859                                     i++;
860                                     break;
861                                 default:
862                                     done = true;
863                                     break;
864                             }
865                             if (!done)
866                                 i++;
867                         }
868                         if (0 != number)
869                         {
870                             buffer.append ((char)number);
871                             index = i;
872                             amp = index;
873                         }
874                         
875                     }
876                     else if (Character.isLetter (character)) // really can't start with a digit eh...
877
{
878                         // character entity reference
879
i = index + 1;
880                         done = false;
881                         semi = length;
882                         while ((i < length) && !done)
883                         {
884                             character = string.charAt (i);
885                             if (';' == character)
886                             {
887                                 done = true;
888                                 semi = i;
889                                 i++;
890                             }
891                             else if (Character.isLetterOrDigit (character))
892                                 i++;
893                             else
894                             {
895                                 done = true;
896                                 semi = i;
897                             }
898                         }
899                         // new CharacterReference (string.substring (index, semi), 0);
900
if (null == key)
901                             key = new CharacterReferenceEx ();
902                         key.setKernel (string);
903                         key.setStart (index);
904                         key.setEnd (semi);
905                         item = lookup (key);
906                         if (null != item)
907                         {
908                             buffer.append ((char)item.getCharacter ());
909                             index += item.getKernel ().length ();
910                             if ((index < length) && (';' == string.charAt (index)))
911                                 index++;
912                             amp = index;
913                         }
914                     }
915                     else
916                     {
917                         // need do nothing here, the ampersand will be consumed below
918
}
919                 }
920                 // gather up unconsumed characters
921
while (amp < index)
922                     buffer.append (string.charAt (amp++));
923             }
924             while ((index < length) && (-1 != (amp = string.indexOf ('&', index))));
925             // equivalent to buffer.append (string.substring (index));
926
// but without the allocation of a new String
927
while (index < length)
928                 buffer.append (string.charAt (index++));
929             ret = buffer.toString ();
930         }
931
932         return (ret);
933     }
934
935     /**
936      * Decode the characters in a string buffer containing references.
937      * Change all numeric character reference and character entity references
938      * to unicode characters.
939      * @param buffer The StringBuffer containing references.
940      * @return The decoded string.
941      */

942     public static String JavaDoc decode (StringBuffer JavaDoc buffer)
943     {
944         return decode (buffer.toString());
945     }
946
947     /**
948      * Decode a stream containing references.
949      * Change all numeric character reference and character entity references
950      * to unicode characters. If <code>DECODE_LINE_BY_LINE</code> is true,
951      * the input stream is broken up into lines, terminated by either
952      * carriage return or newline, in order to reduce the latency and maximum
953      * buffering memory size required.
954      * @param in The stream to translate. It is assumed that the input
955      * stream is encoded with ISO-8859-1 since the table of character
956      * entity references in this class applies only to ISO-8859-1.
957      * @param out The stream to write the decoded stream to.
958      */

959     public static void decode (InputStream JavaDoc in, PrintStream JavaDoc out)
960     {
961         Reader JavaDoc reader;
962         StringBuffer JavaDoc buffer;
963         int character;
964         String JavaDoc string;
965         boolean newlines;
966
967         try
968         {
969             try
970             {
971                 reader = new BufferedReader JavaDoc (new InputStreamReader JavaDoc (in, "ISO-8859-1"));
972             }
973             catch (UnsupportedEncodingException JavaDoc use)
974             {
975                 // yeah, like this will happen; OK, assume the default is ISO-8859-1
976
reader = new BufferedReader JavaDoc (new InputStreamReader JavaDoc (in));
977             }
978             buffer = new StringBuffer JavaDoc (1024);
979             newlines = false;
980             if (DECODE_LINE_BY_LINE)
981                 while (-1 != (character = reader.read ()))
982                 {
983                     if (('\r' == character) || ('\n' == character))
984                     {
985                         if (!newlines)
986                         {
987                             string = decode (buffer.toString ());
988                             out.print (string);
989                             buffer.setLength (0);
990                             newlines = true;
991                         }
992                         buffer.append ((char)character);
993                     }
994                     else
995                     {
996                         if (newlines)
997                         {
998                             out.print (buffer.toString ());
999                             buffer.setLength (0);
1000                            newlines = false;
1001                        }
1002                        buffer.append ((char)character);
1003                    }
1004                }
1005            else
1006                while (-1 != (character = reader.read ()))
1007                    buffer.append ((char)character);
1008            if (0 != buffer.length ())
1009            {
1010                if (newlines)
1011                    out.print (buffer.toString ());
1012                else
1013                {
1014                    string = decode (buffer.toString ());
1015                    out.print (string);
1016                }
1017            }
1018        }
1019        catch (IOException JavaDoc ioe)
1020        {
1021            out.println ();
1022            out.println (ioe.getMessage ());
1023        }
1024        finally
1025        {
1026            out.flush ();
1027        }
1028    }
1029
1030    /**
1031     * Convert a character to a numeric character reference.
1032     * Convert a unicode character to a numeric character reference of
1033     * the form &amp;#xxxx;.
1034     * @param character The character to convert.
1035     * @return The converted character.
1036     * @deprecated Use {@link #encode(int) encode}.
1037     */

1038    public static String JavaDoc convertToString (int character)
1039    {
1040        return (encode (character));
1041    }
1042
1043    /**
1044     * Convert a character to a numeric character reference.
1045     * Convert a unicode character to a numeric character reference of
1046     * the form &amp;#xxxx;.
1047     * @param character The character to convert.
1048     * @return The converted character.
1049     */

1050    public static String JavaDoc encode (int character)
1051    {
1052        StringBuffer JavaDoc ret;
1053
1054        ret = new StringBuffer JavaDoc (13); /* &#2147483647; */
1055        ret.append ("&#");
1056        if (ENCODE_HEXADECIMAL)
1057        {
1058            ret.append ("x");
1059            ret.append (Integer.toHexString (character));
1060        }
1061        else
1062            ret.append (character);
1063        ret.append (';');
1064
1065        return (ret.toString ());
1066    }
1067    
1068    /**
1069     * Encode a string to use references.
1070     * Change all characters that are not ISO-8859-1 to their numeric character
1071     * reference or character entity reference.
1072     * @param string The string to translate.
1073     * @return The encoded string.
1074     */

1075    public static String JavaDoc encode (String JavaDoc string)
1076    {
1077        int length;
1078        char c;
1079        CharacterReference candidate;
1080        StringBuffer JavaDoc ret;
1081
1082        ret = new StringBuffer JavaDoc (string.length () * 6);
1083        length = string.length ();
1084        for (int i = 0; i < length; i++)
1085        {
1086            c = string.charAt (i);
1087            candidate = lookup (c);
1088            if (null != candidate)
1089            {
1090                ret.append ('&');
1091                ret.append (candidate.getKernel ());
1092                ret.append (';');
1093            }
1094            else if (!(c < 0x007F))
1095            {
1096                ret.append ("&#");
1097                if (ENCODE_HEXADECIMAL)
1098                {
1099                    ret.append ("x");
1100                    ret.append (Integer.toHexString (c));
1101                }
1102                else
1103                    ret.append ((int)c);
1104                ret.append (';');
1105            }
1106            else
1107                ret.append (c);
1108        }
1109
1110        return (ret.toString ());
1111    }
1112
1113    /**
1114     * Encode a stream to use references.
1115     * Change all characters that are not ISO-8859-1 to their numeric character
1116     * reference or character entity reference.
1117     * @param in The stream to translate. It is assumed that the input
1118     * stream is encoded with ISO-8859-1 since the table of character
1119     * entity references in this class applies only to ISO-8859-1.
1120     * @param out The stream to write the decoded stream to.
1121     */

1122    public static void encode (InputStream JavaDoc in, PrintStream JavaDoc out)
1123    {
1124        Reader JavaDoc reader;
1125        char c;
1126        int index;
1127        CharacterReference candidate;
1128        PrintWriter JavaDoc output;
1129
1130        try
1131        {
1132            reader = new BufferedReader JavaDoc (new InputStreamReader JavaDoc (in, "ISO-8859-1"));
1133            output = new PrintWriter JavaDoc (new BufferedWriter JavaDoc (new OutputStreamWriter JavaDoc (out, "ISO-8859-1")));
1134        }
1135        catch (UnsupportedEncodingException JavaDoc use)
1136        {
1137            // yeah, like this will happen; OK, assume default is ISO-8859-1
1138
reader = new BufferedReader JavaDoc (new InputStreamReader JavaDoc (in));
1139            output = new PrintWriter JavaDoc (new BufferedWriter JavaDoc (new OutputStreamWriter JavaDoc (out)));
1140        }
1141        try
1142        {
1143            while (-1 != (index = reader.read ()))
1144            {
1145                c = (char)index;
1146                candidate = lookup (c);
1147                if (null != candidate)
1148                {
1149                    output.print ('&');
1150                    output.print (candidate.getKernel ());
1151                    output.print (';');
1152                }
1153                else if (!(c < 0x007F))
1154                {
1155                    output.print ("&#");
1156                    if (ENCODE_HEXADECIMAL)
1157                    {
1158                        output.print ("x");
1159                        output.print (Integer.toHexString (c));
1160                    }
1161                    else
1162                        output.print ((int)c);
1163                    output.print (';');
1164                }
1165                else
1166                    output.print (c);
1167            }
1168        }
1169        catch (IOException JavaDoc ioe)
1170        {
1171            output.println ();
1172            output.println (ioe.getMessage ());
1173        }
1174        finally
1175        {
1176            output.flush ();
1177        }
1178    }
1179
1180    /**
1181     * Numeric character reference and character entity reference to unicode codec.
1182     * Translate the <code>System.in</code> input into an encoded or decoded
1183     * stream and send the results to <code>System.out</code>.
1184     * @param args If arg[0] is <code>-encode</code> perform an encoding on
1185     * <code>System.in</code>, otherwise perform a decoding.
1186     */

1187    public static void main (String JavaDoc[] args)
1188    {
1189        boolean encode;
1190
1191        if (0 < args.length && args[0].equalsIgnoreCase ("-encode"))
1192            encode = true;
1193        else
1194            encode = false;
1195        if (encode)
1196            encode (System.in, System.out);
1197        else
1198            decode (System.in, System.out);
1199    }
1200}
1201
Popular Tags