KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > com > jcorporate > expresso > core > misc > HTMLUtil


1 /* ====================================================================
2  * The Jcorporate Apache Style Software License, Version 1.2 05-07-2002
3  *
4  * Copyright (c) 1995-2002 Jcorporate Ltd. All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  *
10  * 1. Redistributions of source code must retain the above copyright
11  * notice, this list of conditions and the following disclaimer.
12  *
13  * 2. Redistributions in binary form must reproduce the above copyright
14  * notice, this list of conditions and the following disclaimer in
15  * the documentation and/or other materials provided with the
16  * distribution.
17  *
18  * 3. The end-user documentation included with the redistribution,
19  * if any, must include the following acknowledgment:
20  * "This product includes software developed by Jcorporate Ltd.
21  * (http://www.jcorporate.com/)."
22  * Alternately, this acknowledgment may appear in the software itself,
23  * if and wherever such third-party acknowledgments normally appear.
24  *
25  * 4. "Jcorporate" and product names such as "Expresso" must
26  * not be used to endorse or promote products derived from this
27  * software without prior written permission. For written permission,
28  * please contact info@jcorporate.com.
29  *
30  * 5. Products derived from this software may not be called "Expresso",
31  * or other Jcorporate product names; nor may "Expresso" or other
32  * Jcorporate product names appear in their name, without prior
33  * written permission of Jcorporate Ltd.
34  *
35  * 6. No product derived from this software may compete in the same
36  * market space, i.e. framework, without prior written permission
37  * of Jcorporate Ltd. For written permission, please contact
38  * partners@jcorporate.com.
39  *
40  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
41  * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
42  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
43  * DISCLAIMED. IN NO EVENT SHALL JCORPORATE LTD OR ITS CONTRIBUTORS
44  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
45  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
46  * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
47  * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
48  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
49  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
50  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
51  * SUCH DAMAGE.
52  * ====================================================================
53  *
54  * This software consists of voluntary contributions made by many
55  * individuals on behalf of the Jcorporate Ltd. Contributions back
56  * to the project(s) are encouraged when you make modifications.
57  * Please send them to support@jcorporate.com. For more information
58  * on Jcorporate Ltd. and its products, please see
59  * <http://www.jcorporate.com/>.
60  *
61  * Portions of this software are based upon other open source
62  * products and are subject to their respective licenses.
63  */

64
65 package com.jcorporate.expresso.core.misc;
66
67 import java.util.Arrays JavaDoc;
68
/**
 * Copyright 1999, 2002, 2002 Yves Henri AMAIZO.
 * amy_amaizo@compuserve.com
 * <p>
 * Converts plain text to HTML text that uses symbolic codes (&amp;xxxx;), and
 * converts HTML text containing such symbolic codes back to plain text.
 * Both conversions are driven by a single 256-entry table indexed by
 * character code.
 *
 * @author Yves Henri AMAIZO
 */
public class HTMLUtil {

    /**
     * Prevent instantiation: this class only offers static helpers.
     */
    private HTMLUtil() {
    }

    /**
     * Converts a text to an HTML format.
     * <p>
     * Every character whose code is inside the symbolic code table and that has
     * a non-empty entry there is replaced by that entry (for instance a line
     * feed becomes <code>&lt;br&gt;</code>); every other character is copied
     * unchanged.
     * <p>
     * NOTE: the table has no entry for <code>&amp;</code>, <code>&lt;</code>
     * and <code>&gt;</code>, so these are copied unchanged. The result is
     * therefore not an escaped form suitable for embedding untrusted text in a
     * page.
     *
     * @param text The original text string, may be <code>null</code>
     * @return The converted HTML text including symbolic codes, or
     *         <code>null</code> when <code>text</code> is <code>null</code>
     */
    public static String text2html(String text) {
        if (text == null) {
            return text;
        }
        int length = text.length();
        // The extra capacity is only a guess to absorb a few expansions.
        StringBuffer html = new StringBuffer(length + 10);

        for (int i = 0; i < length; i++) {
            char c = text.charAt(i);
            // Characters beyond the table have no symbolic code.
            String symbolic = (c < SYMBOLIC_CODE.length) ? SYMBOLIC_CODE[c] : "";
            if (symbolic.length() == 0) {
                html.append(c);
            } else {
                html.append(symbolic);
            }
        }
        return html.toString();
    }

    /**
     * Converts an HTML text format to a normal text format.
     * <p>
     * Each candidate code starts at an <code>&amp;</code> and runs up to and
     * including the next <code>;</code>, or stops just before the next
     * <code>&amp;</code>, or at the end of the text. A candidate that is found
     * in the symbolic code table is replaced by the character whose code is the
     * table index; any other candidate is copied unchanged. Tags such as
     * <code>&lt;br&gt;</code> are not converted back.
     *
     * @param text The original HTML text string, may be <code>null</code>
     * @return The converted text without the known symbolic codes, or
     *         <code>null</code> when <code>text</code> is <code>null</code>
     */
    public static String html2text(String text) {
        if (text == null) {
            return text;
        }
        int length = text.length();
        StringBuffer plain = new StringBuffer(length);

        for (int i = 0; i < length; i++) {
            char c = text.charAt(i);
            if (c != '&') {
                plain.append(c);
                continue;
            }

            // Collect the candidate code in a buffer (string concatenation in
            // this loop would be quadratic on a long unterminated run).
            StringBuffer candidate = new StringBuffer();
            candidate.append(c);
            while (++i < length) {
                char next = text.charAt(i);
                if (next == '&') {
                    // Leave the new '&' for the outer loop to start from.
                    i--;
                    break;
                }
                candidate.append(next);
                if (next == ';') {
                    break;
                }
            }

            String code = candidate.toString();
            int index = Arrays.binarySearch(SORTED_SYMBOLIC_CODE,
                    new NumericSymbolicCode(code, 0));
            // Does the extracted code correspond to something ?
            if (index >= 0) {
                plain.append((char) SORTED_SYMBOLIC_CODE[index].getNumericCode());
            } else {
                plain.append(code);
            }
        }
        return plain.toString();
    }

    /**
     * Builds the reverse lookup table: one entry per slot of
     * <code>SYMBOLIC_CODE</code>, sorted by symbolic code so that it can be
     * searched with <code>Arrays.binarySearch</code>.
     * <p>
     * The array is completely filled and sorted before it is returned, so it is
     * never visible in a half-built state.
     *
     * @return the sorted reverse lookup table
     */
    private static NumericSymbolicCode[] buildSortedArray() {
        NumericSymbolicCode[] sorted = new NumericSymbolicCode[SYMBOLIC_CODE.length];

        for (int i = 0; i < SYMBOLIC_CODE.length; i++) {
            sorted[i] = new NumericSymbolicCode(SYMBOLIC_CODE[i], i);
        }
        Arrays.sort(sorted);
        return sorted;
    }

    /**
     * Array of symbolic code order by numeric code ! <br>
     * The symbolic codes and their position correspond to the ISO 8859-1 set
     * of char. The empty definitions mean that there is no symbolic codes for
     * that character or this symbolic code is not used.
     */
    private static final String[] SYMBOLIC_CODE = {
        // 0
        "", "", "", "", "", "", "", "", "", "",
        // 10
        "<br>", "", "", "", "", "", "", "", "", "",
        // 20
        "", "", "", "", "",
        "&#25;", // numeric code 25 (labelled "yen sign" by the original author; see &yen; at 165)
        "", "", "", "",
        // 30
        "", "", "", "",
        "&quot;", // quotation mark
        "", "", "", "", "&#39;",
        // 40
        "", "", "", "", "", "", "", "", "", "",
        // 50
        "", "", "", "", "", "", "", "", "", "",
        // 60
        "", "", "", "",
        "&#64;", // commercial at
        "", "", "", "", "",
        // 70
        "", "", "", "", "", "", "", "", "", "",
        // 80
        "", "", "", "", "", "", "", "", "", "",
        // 90
        "", "", "", "", "", "",
        "&#96;", // grave accent
        "", "", "",
        // 100
        "", "", "", "", "", "", "", "", "", "",
        // 110
        "", "", "", "", "", "", "", "", "", "",
        // 120
        "", "", "", "", "", "", "", "", "&#128;", "",
        // 130
        "", "", "", "", "", "", "", "", "", "",
        // 140
        "", "", "", "", "", "&#145;",
        "&#146;", // other apostrophe
        "&#147;", "&#148;", "",
        // 150
        "", "", "", "", "", "", "", "", "", "",
        // 160
        "", // non breaking space (should be &nbsp;)
        "&iexcl;", // inverted exclamation sign
        "&cent;", // cent sign
        "&pound;", // pound sterling sign
        "&curren;", // general currency sign
        "&yen;", // yen sign
        "&brvbar;", // broken vertical bar
        "&sect;", // section sign (legal)
        "&uml;", // umlaut (dieresis)
        "&copy;", // copyright
        // 170
        "&ordf;", // feminine ordinal
        "&laquo;", // guillemet left
        "&not;", // not sign
        "&shy;", // soft hyphen
        "&reg;", // registered trademark
        "&macr;", // macron accent
        "&deg;", // degree sign
        "&plusmn;", // plus or minus
        "&sup2;", // raised to square(superscript two)
        "&sup3;", // superscript three
        // 180
        "&acute;", // acute accent
        "&micro;", // micron sign
        "&para;", // paragraph sign
        "&middot;", // middle dot
        "&cedil;", // cedilla mark
        "&sup1;", // raised to one(superscript one)
        "&ordm;", // masculine ordinal
        "&raquo;", // guillemet right
        "&frac14;", // one-fourth fraction
        "&frac12;", // half fraction
        // 190
        "&frac34;", // three-fourths fraction
        "&iquest;", // inverted question mark
        "&Agrave;", // A with grave accent
        "&Aacute;", // A with acute accent
        "&Acirc;", // A with circumflex accent
        "&Atilde;", // A with tilde accent
        "&Auml;", // A with umlaut mark
        "&Aring;", // A with ring (angstrom)
        "&AElig;", // AE diphthong (ligature)
        "&Ccedil;", // C with cedilla mark
        // 200
        "&Egrave;", // E with grave accent
        "&Eacute;", // E with acute accent
        "&Ecirc;", // E with circumflex accent
        "&Euml;", // E with umlaut mark
        "&Igrave;", // I with grave accent
        "&Iacute;", // I with acute accent
        "&Icirc;", // I with circumflex accent
        "&Iuml;", // I with umlaut mark
        "&ETH;", // Icelandic Capital Eth
        "&Ntilde;", // N with tilde accent
        // 210
        "&Ograve;", // O with grave accent
        "&Oacute;", // O with acute accent
        "&Ocirc;", // O with circumflex accent
        "&Otilde;", // O with tilde accent
        "&Ouml;", // O with umlaut mark
        "&times;", // multiply sign
        "&Oslash;", // O slash
        "&Ugrave;", // U with grave accent
        "&Uacute;", // U with acute accent
        "&Ucirc;", // U with circumflex accent
        // 220
        "&Uuml;", // U with umlaut mark
        "&Yacute;", // Y with acute accent
        "&THORN;", // Icelandic Capital Thorn
        "&szlig;", // small sharp s(sz ligature)
        "&agrave;", // a with grave accent
        "&aacute;", // a with acute accent
        "&acirc;", // a with circumflex accent
        "&atilde;", // a with tilde accent
        "&auml;", // a with umlaut mark
        "&aring;", // a with ring (angstrom)
        // 230
        "&aelig;", // ae diphthong (ligature)
        "&ccedil;", // c with cedilla mark
        "&egrave;", // e with grave accent
        "&eacute;", // e with acute accent
        "&ecirc;", // e with circumflex accent
        "&euml;", // e with umlaut mark
        "&igrave;", // i with grave accent
        "&iacute;", // i with acute accent
        "&icirc;", // i with circumflex accent
        "&iuml;", // i with umlaut mark
        // 240
        "&eth;", // Icelandic small eth
        "&ntilde;", // n with tilde accent
        "&ograve;", // o with grave accent
        "&oacute;", // o with acute accent
        "&ocirc;", // o with circumflex accent
        "&otilde;", // o with tilde accent
        "&ouml;", // o with umlaut mark
        "&divide;", // divide sign
        "&oslash;", // o slash
        "&ugrave;", // u with grave accent
        // 250
        "&uacute;", // u with acute accent
        "&ucirc;", // u with circumflex accent
        "&uuml;", // u with umlaut mark
        "&yacute;", // y with acute accent
        "&thorn;", // Icelandic small thorn
        "&yuml;", // y with umlaut mark
    };

    /**
     * Array of symbolic code ordered by symbolic code !<br>
     * This array is the reciprocal from the 'SYMBOLIC_CODE' array.
     * It must stay declared after 'SYMBOLIC_CODE': static initializers run in
     * textual order and this one reads that table.
     */
    private static final NumericSymbolicCode[] SORTED_SYMBOLIC_CODE = buildSortedArray();

    /**
     * This class is the structure used for the 'SORTED_SYMBOLIC_CODE' array.
     * Each symbolic code string (sorted by alphabetical order) has its
     * corresponding numerical code.<br>
     * This class also implements the 'Comparable' interface to ease the sorting
     * and the binary search. The raw 'Comparable' type is kept on purpose: the
     * file uses no generics.
     */
    private static final class NumericSymbolicCode implements Comparable {

        /** Symbolic code text, for instance an entity name with its delimiters. */
        private final String symbolicCode;

        /** Index of the symbolic code inside the 'SYMBOLIC_CODE' table. */
        private final int numericCode;

        public NumericSymbolicCode(String symbolicCode, int numericCode) {
            this.symbolicCode = symbolicCode;
            this.numericCode = numericCode;
        }

        public String getSymbolicCode() {
            return symbolicCode;
        }

        public int getNumericCode() {
            return numericCode;
        }

        /**
         * Orders entries by symbolic code only; the numeric code is ignored.
         *
         * @param object the other entry, must be a NumericSymbolicCode
         * @return the result of comparing the two symbolic code strings
         */
        public int compareTo(Object object) {
            NumericSymbolicCode other = (NumericSymbolicCode) object;
            return symbolicCode.compareTo(other.symbolicCode);
        }
    }

}
362
Popular Tags