HTMLUtil


1   /* ====================================================================
2    * The Jcorporate Apache Style Software License, Version 1.2 05-07-2002
3    *
4    * Copyright (c) 1995-2002 Jcorporate Ltd. All rights reserved.
5    *
6    * Redistribution and use in source and binary forms, with or without
7    * modification, are permitted provided that the following conditions
8    * are met:
9    *
10   * 1. Redistributions of source code must retain the above copyright
11   *    notice, this list of conditions and the following disclaimer.
12   *
13   * 2. Redistributions in binary form must reproduce the above copyright
14   *    notice, this list of conditions and the following disclaimer in
15   *    the documentation and/or other materials provided with the
16   *    distribution.
17   *
18   * 3. The end-user documentation included with the redistribution,
19   *    if any, must include the following acknowledgment:
20   *       "This product includes software developed by Jcorporate Ltd.
21   *        (http://www.jcorporate.com/)."
22   *    Alternately, this acknowledgment may appear in the software itself,
23   *    if and wherever such third-party acknowledgments normally appear.
24   *
25   * 4. "Jcorporate" and product names such as "Expresso" must
26   *    not be used to endorse or promote products derived from this
27   *    software without prior written permission. For written permission,
28   *    please contact info@jcorporate.com.
29   *
30   * 5. Products derived from this software may not be called "Expresso",
31   *    or other Jcorporate product names; nor may "Expresso" or other
32   *    Jcorporate product names appear in their name, without prior
33   *    written permission of Jcorporate Ltd.
34   *
35   * 6. No product derived from this software may compete in the same
36   *    market space, i.e. framework, without prior written permission
37   *    of Jcorporate Ltd. For written permission, please contact
38   *    partners@jcorporate.com.
39   *
40   * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
41   * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
42   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
43   * DISCLAIMED.  IN NO EVENT SHALL JCORPORATE LTD OR ITS CONTRIBUTORS
44   * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
45   * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
46   * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
47   * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
48   * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
49   * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
50   * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
51   * SUCH DAMAGE.
52   * ====================================================================
53   *
54   * This software consists of voluntary contributions made by many
55   * individuals on behalf of the Jcorporate Ltd. Contributions back
56   * to the project(s) are encouraged when you make modifications.
57   * Please send them to support@jcorporate.com. For more information
58   * on Jcorporate Ltd. and its products, please see
59   * <http://www.jcorporate.com/>.
60   *
61   * Portions of this software are based upon other open source
62   * products and are subject to their respective licenses.
63   */
64  
65  package com.jcorporate.expresso.core.misc;
66  
67  import java.util.Arrays  ;
68  
/**
 * Copyright 1999, 2002, 2002 Yves Henri AMAIZO.
 * amy_amaizo@compuserve.com
 * <p>
 * Utility class that converts plain text to HTML text using symbolic codes
 * (character entity references such as <code>&amp;eacute;</code>), and converts
 * HTML text containing such symbolic codes back to plain text.
 * <p>
 * The conversion is driven by a 256-entry table indexed by character code.
 * Note that the table has no entry for <code>&amp;</code>, <code>&lt;</code>
 * or <code>&gt;</code>, so those characters are passed through unchanged by
 * {@link #text2html(String)}; this class is therefore not an HTML escaper for
 * untrusted input.
 *
 * @author Yves Henri AMAIZO
 */
public class HTMLUtil {

    /**
     * Prevent instantiation: this class only exposes static helpers.
     */
    private HTMLUtil() {
    }

    /**
     * Converts a text to an HTML format.
     * <p>
     * Every character whose code is inside the symbolic code table and that
     * has a non-empty entry is replaced by that entry (for example a line
     * feed becomes <code>&lt;br&gt;</code>). All other characters are copied
     * unchanged.
     *
     * @param text The original text string; may be <code>null</code>
     * @return The converted HTML text including symbolic codes, or
     *         <code>null</code> when <code>text</code> is <code>null</code>
     */
    public static String text2html(String text) {
        if (text == null) {
            return text;
        }
        // The extra capacity is only a sizing hint; the buffer grows as needed.
        StringBuffer html = new StringBuffer(text.length() + 10);
        for (int i = 0; i < text.length(); i++) {
            char c = text.charAt(i);
            // Characters beyond the table (non ISO 8859-1) are copied as they are.
            String symbolic = (c < SYMBOLIC_CODE.length) ? SYMBOLIC_CODE[c] : "";
            if ("".equals(symbolic)) {
                html.append(c);
            } else {
                html.append(symbolic);
            }
        }
        return html.toString();
    }

    /**
     * Converts an HTML text format to a normal text format.
     * <p>
     * Each <code>&amp;</code> starts a candidate symbolic code that runs up to
     * and including the next <code>;</code>, or stops just before the next
     * <code>&amp;</code>, or at the end of the text. A candidate found in the
     * symbolic code table is replaced by the corresponding character; any
     * other candidate is copied to the output unchanged.
     *
     * @param text The original HTML text string; may be <code>null</code>
     * @return The converted text without the known symbolic codes, or
     *         <code>null</code> when <code>text</code> is <code>null</code>
     */
    public static String html2text(String text) {
        if (text == null) {
            return text;
        }
        int length = text.length();
        StringBuffer plain = new StringBuffer(length);
        for (int i = 0; i < length; i++) {
            char c = text.charAt(i);
            if (c != '&') {
                plain.append(c);
                continue;
            }

            // Collect the candidate code, starting with the '&' itself.
            StringBuffer candidate = new StringBuffer();
            candidate.append(c);
            while (++i < length) {
                char next = text.charAt(i);
                if (next == '&') {
                    // A new candidate starts here: let the outer loop re-read it.
                    i--;
                    break;
                }
                candidate.append(next);
                if (next == ';') {
                    break;
                }
            }

            String code = candidate.toString();
            int index = Arrays.binarySearch(SORTED_SYMBOLIC_CODE,
                    new NumericSymbolicCode(code, 0));
            // Does the extracted code correspond to something?
            if (index >= 0) {
                plain.append((char) SORTED_SYMBOLIC_CODE[index].getNumericCode());
            } else {
                plain.append(code);
            }
        }
        return plain.toString();
    }

    /**
     * Builds the reverse lookup table: one entry per element of
     * 'SYMBOLIC_CODE', sorted by symbolic code so that it can be searched
     * with a binary search.
     *
     * @return the fully populated and sorted array
     */
    private static NumericSymbolicCode[] buildSortedArray() {
        NumericSymbolicCode[] sorted = new NumericSymbolicCode[SYMBOLIC_CODE.length];
        for (int i = 0; i < SYMBOLIC_CODE.length; i++) {
            sorted[i] = new NumericSymbolicCode(SYMBOLIC_CODE[i], i);
        }
        Arrays.sort(sorted);
        return sorted;
    }

    /**
     * Array of symbolic codes ordered by numeric code ! <br>
     * The symbolic codes and their position correspond to the ISO 8859-1 set
     * of char. The empty definitions mean that there is no symbolic code for
     * that character or that the symbolic code is not used.
     */
    private static final String[] SYMBOLIC_CODE = {
        // 0
        "", "", "", "", "", "", "", "", "", "",
        // 10
        "<br>", // line feed
        "", "", "", "", "", "", "", "", "",
        // 20
        "", "", "", "", "",
        "&#25;", // control character U+0019, emitted as a numeric reference
        "", "", "", "",
        // 30
        "", "", "", "",
        "&quot;", // quotation mark
        "", "", "", "",
        "&#39;", // apostrophe
        // 40
        "", "", "", "", "", "", "", "", "", "",
        // 50
        "", "", "", "", "", "", "", "", "", "",
        // 60
        "", "", "", "",
        "&#64;", // commercial at
        "", "", "", "", "",
        // 70
        "", "", "", "", "", "", "", "", "", "",
        // 80
        "", "", "", "", "", "", "", "", "", "",
        // 90
        "", "", "", "", "", "",
        "&#96;", // grave accent
        "", "", "",
        // 100
        "", "", "", "", "", "", "", "", "", "",
        // 110-130
        "", "", "", "", "", "", "", "", "", "",
        "", "", "", "", "", "", "", "", "&#128;", "",
        "", "", "", "", "", "", "", "", "", "",
        // 140
        // NOTE(review): 145-148 are presumably meant as the Windows-1252
        // curly quotes - confirm.
        "", "", "", "", "", "&#145;",
        "&#146;", // other apostrophe
        "&#147;", "&#148;", "",
        // 150
        "", "", "", "", "", "", "", "", "", "",
        // 160
        "", // non breaking space (should be &nbsp;)
        "&iexcl;", // inverted exclamation sign
        "&cent;", // cent sign
        "&pound;", // pound sterling sign
        "&curren;", // general currency sign
        "&yen;", // yen sign
        "&brvbar;", // broken vertical bar
        "&sect;", // section sign (legal)
        "&uml;", // umlaut (dieresis)
        "&copy;", // copyright
        // 170
        "&ordf;", // feminine ordinal
        "&laquo;", // guillemet left
        "&not;", // not sign
        "&shy;", // soft hyphen
        "&reg;", // registered trademark
        "&macr;", // macron accent
        "&deg;", // degree sign
        "&plusmn;", // plus or minus
        "&sup2;", // raised to square (superscript two)
        "&sup3;", // superscript three
        // 180
        "&acute;", // acute accent
        "&micro;", // micro sign
        "&para;", // paragraph sign (pilcrow)
        "&middot;", // middle dot
        "&cedil;", // cedilla mark
        "&sup1;", // raised to one (superscript one)
        "&ordm;", // masculine ordinal
        "&raquo;", // guillemet right
        "&frac14;", // one-quarter fraction
        "&frac12;", // half fraction
        // 190
        "&frac34;", // three-quarters fraction
        "&iquest;", // inverted question mark
        "&Agrave;", // A with grave accent
        "&Aacute;", // A with acute accent
        "&Acirc;", // A with circumflex accent
        "&Atilde;", // A with tilde accent
        "&Auml;", // A with umlaut mark
        "&Aring;", // A with ring (angstrom)
        "&AElig;", // AE diphthong (ligature)
        "&Ccedil;", // C with cedilla mark
        // 200
        "&Egrave;", // E with grave accent
        "&Eacute;", // E with acute accent
        "&Ecirc;", // E with circumflex accent
        "&Euml;", // E with umlaut mark
        "&Igrave;", // I with grave accent
        "&Iacute;", // I with acute accent
        "&Icirc;", // I with circumflex accent
        "&Iuml;", // I with umlaut mark
        "&ETH;", // Icelandic Capital Eth
        "&Ntilde;", // N with tilde accent
        // 210
        "&Ograve;", // O with grave accent
        "&Oacute;", // O with acute accent
        "&Ocirc;", // O with circumflex accent
        "&Otilde;", // O with tilde accent
        "&Ouml;", // O with umlaut mark
        "&times;", // multiply sign
        "&Oslash;", // O slash
        "&Ugrave;", // U with grave accent
        "&Uacute;", // U with acute accent
        "&Ucirc;", // U with circumflex accent
        // 220
        "&Uuml;", // U with umlaut mark
        "&Yacute;", // Y with acute accent
        "&THORN;", // Icelandic Capital Thorn
        "&szlig;", // small sharp s (sz ligature)
        "&agrave;", // a with grave accent
        "&aacute;", // a with acute accent
        "&acirc;", // a with circumflex accent
        "&atilde;", // a with tilde accent
        "&auml;", // a with umlaut mark
        "&aring;", // a with ring (angstrom)
        // 230
        "&aelig;", // ae diphthong (ligature)
        "&ccedil;", // c with cedilla mark
        "&egrave;", // e with grave accent
        "&eacute;", // e with acute accent
        "&ecirc;", // e with circumflex accent
        "&euml;", // e with umlaut mark
        "&igrave;", // i with grave accent
        "&iacute;", // i with acute accent
        "&icirc;", // i with circumflex accent
        "&iuml;", // i with umlaut mark
        // 240
        "&eth;", // Icelandic small eth
        "&ntilde;", // n with tilde accent
        "&ograve;", // o with grave accent
        "&oacute;", // o with acute accent
        "&ocirc;", // o with circumflex accent
        "&otilde;", // o with tilde accent
        "&ouml;", // o with umlaut mark
        "&divide;", // divide sign
        "&oslash;", // o slash
        "&ugrave;", // u with grave accent
        // 250
        "&uacute;", // u with acute accent
        "&ucirc;", // u with circumflex accent
        "&uuml;", // u with umlaut mark
        "&yacute;", // y with acute accent
        "&thorn;", // Icelandic small thorn
        "&yuml;", // y with umlaut mark
    };

    /**
     * Array of symbolic codes ordered by symbolic code !<br>
     * This array is the reciprocal of the 'SYMBOLIC_CODE' array. It is built
     * once, during class initialization, and must stay declared after
     * 'SYMBOLIC_CODE' because static initializers run in textual order.
     */
    private static final NumericSymbolicCode[] SORTED_SYMBOLIC_CODE = buildSortedArray();

    /**
     * This class is the structure used for the 'SORTED_SYMBOLIC_CODE' array.
     * Each symbolic code string (sorted by alphabetical order) has its
     * corresponding numerical code.<br>
     * This class also implements the 'Comparable' interface to ease the
     * sorting and searching of the array; the ordering only considers the
     * symbolic code, never the numeric code.
     */
    final private static class NumericSymbolicCode implements Comparable {

        private final String symbolicCode;
        private final int numericCode;

        public NumericSymbolicCode(String symbolicCode, int numericCode) {
            this.symbolicCode = symbolicCode;
            this.numericCode = numericCode;
        }

        public String getSymbolicCode() {
            return symbolicCode;
        }

        public int getNumericCode() {
            return numericCode;
        }

        /**
         * Orders instances by their symbolic code string.
         *
         * @param object the other NumericSymbolicCode to compare with
         * @return the result of comparing the two symbolic code strings
         */
        public int compareTo(Object object) {
            NumericSymbolicCode other = (NumericSymbolicCode) object;
            return symbolicCode.compareTo(other.symbolicCode);
        }
    }

}
362
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags