KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > opencms > i18n > CmsEncoder


1 /*
2  * File : $Source: /usr/local/cvs/opencms/src/org/opencms/i18n/CmsEncoder.java,v $
3  * Date : $Date: 2006/07/20 13:46:39 $
4  * Version: $Revision: 1.20 $
5  *
6  * This library is part of OpenCms -
7  * the Open Source Content Management System
8  *
9  * Copyright (c) 2005 Alkacon Software GmbH (http://www.alkacon.com)
10  *
11  * This library is free software; you can redistribute it and/or
12  * modify it under the terms of the GNU Lesser General Public
13  * License as published by the Free Software Foundation; either
14  * version 2.1 of the License, or (at your option) any later version.
15  *
16  * This library is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19  * Lesser General Public License for more details.
20  *
21  * For further information about Alkacon Software GmbH, please see the
22  * company website: http://www.alkacon.com
23  *
24  * For further information about OpenCms, please see the
25  * project website: http://www.opencms.org
26  *
27  * You should have received a copy of the GNU Lesser General Public
28  * License along with this library; if not, write to the Free Software
29  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
30  */

31
32 package org.opencms.i18n;
33
import org.opencms.main.CmsLog;
import org.opencms.main.OpenCms;
import org.opencms.util.CmsStringUtil;

import java.io.UnsupportedEncodingException;
import java.net.URLDecoder;
import java.net.URLEncoder;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.nio.charset.CharsetEncoder;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.logging.Log;
50
51 /**
52  * The OpenCms CmsEncoder class provides static methods to decode and encode data.<p>
53  *
54  * The methods in this class are substitutes for <code>java.net.URLEncoder.encode()</code> and
55  * <code>java.net.URLDecoder.decode()</code>. Use the methods from this class in all OpenCms
56  * core classes to ensure the encoding is always handled the same way.<p>
57  *
58  * The de- and encoding uses the same coding mechanism as JavaScript, special characters are
59  * replaxed with <code>%hex</code> where hex is a two digit hex number.<p>
60  *
61  * <b>Note:</b> On the client side (browser) instead of using corresponding <code>escape</code>
62  * and <code>unescape</code> JavaScript functions, better use <code>encodeURIComponent</code> and
63  * <code>decodeURIComponent</code> functions wich are work properly with unicode characters.
64  * These functions are supported in IE 5.5+ and NS 6+ only.<p>
65  *
66  * @author Alexander Kandzior
67  *
68  * @version $Revision: 1.20 $
69  *
70  * @since 6.0.0
71  */

72 public final class CmsEncoder {
73
74     /** Constant for the standard <code>ISO-8859-1</code> encoding. */
75     public static final String JavaDoc ENCODING_ISO_8859_1 = "ISO-8859-1";
76
77     /** Constant for the standard <code>US-ASCII</code> encoding. */
78     public static final String JavaDoc ENCODING_US_ASCII = "US-ASCII";
79
80     /**
81      * Constant for the standard <code>UTF-8</code> encoding.<p>
82      *
83      * Default encoding for JavaScript decodeUriComponent methods is <code>UTF-8</code> by w3c standard.
84      */

85     public static final String JavaDoc ENCODING_UTF_8 = "UTF-8";
86
87     /** The regex pattern to match HTML entities. */
88     private static final Pattern JavaDoc ENTITIY_PATTERN = Pattern.compile("\\&#\\d+;");
89
90     /** The prefix for HTML entities. */
91     private static final String JavaDoc ENTITY_PREFIX = "&#";
92
93     /** The replacement for HTML entity prefix in parameters. */
94     private static final String JavaDoc ENTITY_REPLACEMENT = "$$";
95
96     /** The log object for this class. */
97     private static final Log LOG = CmsLog.getLog(CmsEncoder.class);
98
99     /** A cache for encoding name lookup. */
100     private static Map JavaDoc m_encodingCache = new HashMap JavaDoc(16);
101
102     /** The plus entity. */
103     private static final String JavaDoc PLUS_ENTITY = ENTITY_PREFIX + "043;";
104
105     /**
106      * Constructor.<p>
107      */

108     private CmsEncoder() {
109
110         // empty
111
}
112
113     /**
114      * Adjusts the given String by making sure all characters that can be displayed
115      * in the given charset are contained as chars, whereas all other non-displayable
116      * characters are converted to HTML entities.<p>
117      *
118      * Just calls {@link #decodeHtmlEntities(String, String)} first and feeds the result
119      * to {@link #encodeHtmlEntities(String, String)}. <p>
120      *
121      * @param input the input to adjust the HTML encoding for
122      * @param encoding the charset to encode the result with
123      * @return the input with the decoded/encoded HTML entities
124      */

125     public static String JavaDoc adjustHtmlEncoding(String JavaDoc input, String JavaDoc encoding) {
126
127         return encodeHtmlEntities(decodeHtmlEntities(input, encoding), encoding);
128     }
129
130     /**
131      * Changes the encoding of a byte array that represents a String.<p>
132      *
133      * @param input the byte array to convert
134      * @param oldEncoding the current encoding of the byte array
135      * @param newEncoding the new encoding of the byte array
136      * @return byte[] the byte array encoded in the new encoding
137      */

138     public static byte[] changeEncoding(byte[] input, String JavaDoc oldEncoding, String JavaDoc newEncoding) {
139
140         if ((oldEncoding == null) || (newEncoding == null)) {
141             return input;
142         }
143         if (oldEncoding.trim().equalsIgnoreCase(newEncoding.trim())) {
144             return input;
145         }
146         byte[] result = input;
147         try {
148             result = (new String JavaDoc(input, oldEncoding)).getBytes(newEncoding);
149         } catch (UnsupportedEncodingException JavaDoc e) {
150             // return value will be input value
151
}
152         return result;
153     }
154
155     /**
156      * Creates a String out of a byte array with the specified encoding, falling back
157      * to the system default in case the encoding name is not valid.<p>
158      *
159      * Use this method as a replacement for <code>new String(byte[], encoding)</code>
160      * to avoid possible encoding problems.<p>
161      *
162      * @param bytes the bytes to decode
163      * @param encoding the encoding scheme to use for decoding the bytes
164      * @return the bytes decoded to a String
165      */

166     public static String JavaDoc createString(byte[] bytes, String JavaDoc encoding) {
167
168         if (encoding.intern() != OpenCms.getSystemInfo().getDefaultEncoding()) {
169             encoding = lookupEncoding(encoding, null);
170         }
171         if (encoding != null) {
172             try {
173                 return new String JavaDoc(bytes, encoding);
174             } catch (UnsupportedEncodingException JavaDoc e) {
175                 // this can _never_ happen since the charset was looked up first
176
}
177         } else {
178             if (LOG.isWarnEnabled()) {
179                 LOG.warn(Messages.get().getBundle().key(Messages.ERR_UNSUPPORTED_VM_ENCODING_1, encoding));
180             }
181             encoding = OpenCms.getSystemInfo().getDefaultEncoding();
182             try {
183                 return new String JavaDoc(bytes, encoding);
184             } catch (UnsupportedEncodingException JavaDoc e) {
185                 // this can also _never_ happen since the default encoding is always valid
186
}
187         }
188         // this code is unreachable in pratice
189
LOG.error(Messages.get().getBundle().key(Messages.ERR_ENCODING_ISSUES_1, encoding));
190         return null;
191     }
192
193     /**
194      * Decodes a String using UTF-8 encoding, which is the standard for http data transmission
195      * with GET ant POST requests.<p>
196      *
197      * @param source the String to decode
198      * @return String the decoded source String
199      */

200     public static String JavaDoc decode(String JavaDoc source) {
201
202         return decode(source, ENCODING_UTF_8);
203     }
204
205     /**
206      * This method is a substitute for <code>URLDecoder.decode()</code>.
207      * Use this in all OpenCms core classes to ensure the encoding is
208      * always handled the same way.<p>
209      *
210      * In case you don't know what encoding to use, set the value of
211      * the <code>encoding</code> parameter to <code>null</code>.
212      * This method will then default to UTF-8 encoding, which is propably the right one.<p>
213      *
214      * @param source The string to decode
215      * @param encoding The encoding to use (if null, the system default is used)
216      * @return The decoded source String
217      */

218     public static String JavaDoc decode(String JavaDoc source, String JavaDoc encoding) {
219
220         if (source == null) {
221             return null;
222         }
223         if (encoding != null) {
224             try {
225                 return URLDecoder.decode(source, encoding);
226             } catch (java.io.UnsupportedEncodingException JavaDoc e) {
227                 // will fallback to default
228
}
229         }
230         // fallback to default decoding
231
try {
232             return URLDecoder.decode(source, ENCODING_UTF_8);
233         } catch (java.io.UnsupportedEncodingException JavaDoc e) {
234             // ignore
235
}
236         return source;
237     }
238
239     /**
240      * Decodes HTML entity references like <code>&amp;#8364;</code> that are contained in the
241      * String to a regulat character, but only if that character is contained in the given
242      * encodings charset.<p>
243      *
244      * @param input the input to decode the HTML enties in
245      * @param encoding the charset to decode the input for
246      * @return the input with the decoded HTML entities
247      * @see #encodeHtmlEntities(String, String)
248      */

249     public static String JavaDoc decodeHtmlEntities(String JavaDoc input, String JavaDoc encoding) {
250
251         Matcher JavaDoc matcher = ENTITIY_PATTERN.matcher(input);
252         StringBuffer JavaDoc result = new StringBuffer JavaDoc(input.length());
253         Charset JavaDoc charset = Charset.forName(encoding);
254         CharsetEncoder JavaDoc encoder = charset.newEncoder();
255
256         while (matcher.find()) {
257             String JavaDoc entity = matcher.group();
258             String JavaDoc value = entity.substring(2, entity.length() - 1);
259             int c = Integer.valueOf(value).intValue();
260             if (c < 128) {
261                 // first 128 chars are contained in almost every charset
262
entity = new String JavaDoc(new char[] {(char)c});
263                 // this is intendend as performance improvement since
264
// the canEncode() operation appears quite CPU heavy
265
} else if (encoder.canEncode((char)c)) {
266                 // encoder can endoce this char
267
entity = new String JavaDoc(new char[] {(char)c});
268             }
269             matcher.appendReplacement(result, entity);
270         }
271         matcher.appendTail(result);
272         return result.toString();
273     }
274
275     /**
276      * Decodes a string used as parameter in an uri in a way independent of other encodings/decodings applied before.<p>
277      *
278      * @param input the encoded parameter string
279      * @return the decoded parameter string
280      * @see #encodeParameter(String)
281      */

282     public static String JavaDoc decodeParameter(String JavaDoc input) {
283
284         String JavaDoc result = CmsStringUtil.substitute(input, ENTITY_REPLACEMENT, ENTITY_PREFIX);
285         return CmsEncoder.decodeHtmlEntities(result, OpenCms.getSystemInfo().getDefaultEncoding());
286     }
287
288     /**
289      * Encodes a String using UTF-8 encoding, which is the standard for http data transmission
290      * with GET ant POST requests.<p>
291      *
292      * @param source the String to encode
293      * @return String the encoded source String
294      */

295     public static String JavaDoc encode(String JavaDoc source) {
296
297         return encode(source, ENCODING_UTF_8);
298     }
299
300     /**
301      * This method is a substitute for <code>URLEncoder.encode()</code>.
302      * Use this in all OpenCms core classes to ensure the encoding is
303      * always handled the same way.<p>
304      *
305      * In case you don't know what encoding to use, set the value of
306      * the <code>encoding</code> parameter to <code>null</code>.
307      * This method will then default to UTF-8 encoding, which is propably the right one.<p>
308      *
309      * @param source the String to encode
310      * @param encoding the encoding to use (if null, the system default is used)
311      * @return the encoded source String
312      */

313     public static String JavaDoc encode(String JavaDoc source, String JavaDoc encoding) {
314
315         if (source == null) {
316             return null;
317         }
318         if (encoding != null) {
319             try {
320                 return URLEncoder.encode(source, encoding);
321             } catch (java.io.UnsupportedEncodingException JavaDoc e) {
322                 // will fallback to default
323
}
324         }
325         // fallback to default encoding
326
try {
327             return URLEncoder.encode(source, ENCODING_UTF_8);
328         } catch (java.io.UnsupportedEncodingException JavaDoc e) {
329             // ignore
330
}
331         return source;
332     }
333
334     /**
335      * Encodes all characters that are contained in the String which can not displayed
336      * in the given encodings charset with HTML entity references
337      * like <code>&amp;#8364;</code>.<p>
338      *
339      * This is required since a Java String is
340      * internally always stored as Unicode, meaning it can contain almost every character, but
341      * the HTML charset used might not support all such characters.<p>
342      *
343      * @param input the input to encode for HTML
344      * @param encoding the charset to encode the result with
345      * @return the input with the encoded HTML entities
346      * @see #decodeHtmlEntities(String, String)
347      */

348     public static String JavaDoc encodeHtmlEntities(String JavaDoc input, String JavaDoc encoding) {
349
350         StringBuffer JavaDoc result = new StringBuffer JavaDoc(input.length() * 2);
351         CharBuffer JavaDoc buffer = CharBuffer.wrap(input.toCharArray());
352         Charset JavaDoc charset = Charset.forName(encoding);
353         CharsetEncoder JavaDoc encoder = charset.newEncoder();
354         for (int i = 0; i < buffer.length(); i++) {
355             int c = buffer.get(i);
356             if (c < 128) {
357                 // first 128 chars are contained in almost every charset
358
result.append((char)c);
359                 // this is intendend as performance improvement since
360
// the canEncode() operation appears quite CPU heavy
361
} else if (encoder.canEncode((char)c)) {
362                 // encoder can endoce this char
363
result.append((char)c);
364             } else {
365                 // append HTML entiry reference
366
result.append(ENTITY_PREFIX);
367                 result.append(c);
368                 result.append(";");
369             }
370         }
371         return result.toString();
372     }
373
374     /**
375      * Encodes a string used as parameter in an uri in a way independent of other encodings/decodings applied later.<p>
376      *
377      * Used to ensure that GET parameters are not wrecked by wrong or incompatible configuration settings.
378      * In order to ensure this, the String is first encoded with html entities for any character that cannot encoded
379      * in US-ASCII; additionally, the plus sign is also encoded to avoid problems with the white-space replacer.
380      * Finally, the entity prefix is replaced with characters not used as delimiters in urls.<p>
381      *
382      * @param input the parameter string
383      * @return the encoded parameter string
384      */

385     public static String JavaDoc encodeParameter(String JavaDoc input) {
386
387         String JavaDoc result = CmsEncoder.encodeHtmlEntities(input, CmsEncoder.ENCODING_US_ASCII);
388         result = CmsStringUtil.substitute(result, "+", PLUS_ENTITY);
389         return CmsStringUtil.substitute(result, ENTITY_PREFIX, ENTITY_REPLACEMENT);
390     }
391
392     /**
393      * Encodes a String in a way that is compatible with the JavaScript escape function.
394      *
395      * @param source The textstring to be encoded.
396      * @param encoding the encoding type
397      * @return The JavaScript escaped string.
398      */

399     public static String JavaDoc escape(String JavaDoc source, String JavaDoc encoding) {
400
401         // the blank is encoded into "+" not "%20" when using standard encode call
402
return CmsStringUtil.substitute(encode(source, encoding), "+", "%20");
403     }
404
405     /**
406      * Escapes special characters in a HTML-String with their number-based
407      * entity representation, for example &amp; becomes &amp;#38;.<p>
408      *
409      * A character <code>num</code> is replaced if<br>
410      * <code>((ch != 32) && ((ch > 122) || (ch < 48) || (ch == 60) || (ch == 62)))</code><p>
411      *
412      * @param source the String to escape
413      * @return String the escaped String
414      *
415      * @see #escapeXml(String)
416      */

417     public static String JavaDoc escapeHtml(String JavaDoc source) {
418
419         int terminatorIndex;
420         if (source == null) {
421             return null;
422         }
423         StringBuffer JavaDoc result = new StringBuffer JavaDoc(source.length() * 2);
424         for (int i = 0; i < source.length(); i++) {
425             int ch = source.charAt(i);
426             // avoid escaping already escaped characters
427
if (ch == 38) {
428                 terminatorIndex = source.indexOf(";", i);
429                 if (terminatorIndex > 0) {
430                     if (source.substring(i + 1, terminatorIndex).matches("#[0-9]+|lt|gt|amp|quote")) {
431                         result.append(source.substring(i, terminatorIndex + 1));
432                         // Skip remaining chars up to (and including) ";"
433
i = terminatorIndex;
434                         continue;
435                     }
436                 }
437             }
438             if ((ch != 32) && ((ch > 122) || (ch < 48) || (ch == 60) || (ch == 62))) {
439                 result.append(ENTITY_PREFIX);
440                 result.append(ch);
441                 result.append(";");
442             } else {
443                 result.append((char)ch);
444             }
445         }
446         return new String JavaDoc(result);
447     }
448
449     /**
450      * Escapes non ASCII characters in a HTML-String with their number-based
451      * entity representation, for example &amp; becomes &amp;#38;.<p>
452      *
453      * A character <code>num</code> is replaced if<br>
454      * <code>(ch > 255)</code><p>
455      *
456      * @param source the String to escape
457      * @return String the escaped String
458      *
459      * @see #escapeXml(String)
460      */

461     public static String JavaDoc escapeNonAscii(String JavaDoc source) {
462
463         if (source == null) {
464             return null;
465         }
466         StringBuffer JavaDoc result = new StringBuffer JavaDoc(source.length() * 2);
467         for (int i = 0; i < source.length(); i++) {
468             int ch = source.charAt(i);
469             if (ch > 255) {
470                 result.append(ENTITY_PREFIX);
471                 result.append(ch);
472                 result.append(";");
473             } else {
474                 result.append((char)ch);
475             }
476         }
477         return new String JavaDoc(result);
478     }
479
480     /**
481      * Encodes a String in a way that is compatible with the JavaScript escape function.
482      * Muliple blanks are encoded _multiply _with %20.
483      *
484      * @param source The textstring to be encoded.
485      * @param encoding the encoding type
486      * @return The JavaScript escaped string.
487      */

488     public static String JavaDoc escapeWBlanks(String JavaDoc source, String JavaDoc encoding) {
489
490         if (CmsStringUtil.isEmpty(source)) {
491             return source;
492         }
493         StringBuffer JavaDoc ret = new StringBuffer JavaDoc(source.length() * 2);
494
495         // URLEncode the text string
496
// this produces a very similar encoding to JavaSscript encoding,
497
// except the blank which is not encoded into "%20" instead of "+"
498

499         String JavaDoc enc = encode(source, encoding);
500         for (int z = 0; z < enc.length(); z++) {
501             char c = enc.charAt(z);
502             if (c == '+') {
503                 ret.append("%20");
504             } else {
505                 ret.append(c);
506             }
507         }
508         return ret.toString();
509     }
510
511     /**
512      * Escapes a String so it may be printed as text content or attribute
513      * value in a HTML page or an XML file.<p>
514      *
515      * This method replaces the following characters in a String:
516      * <ul>
517      * <li><b>&lt;</b> with &amp;lt;
518      * <li><b>&gt;</b> with &amp;gt;
519      * <li><b>&amp;</b> with &amp;amp;
520      * <li><b>&quot;</b> with &amp;quot;
521      * </ul>
522      *
523      * @param source the string to escape
524      * @return the escaped string
525      *
526      * @see #escapeHtml(String)
527      */

528     public static String JavaDoc escapeXml(String JavaDoc source) {
529
530         return escapeXml(source, false);
531     }
532
533     /**
534      * Escapes a String so it may be printed as text content or attribute
535      * value in a HTML page or an XML file.<p>
536      *
537      * This method replaces the following characters in a String:
538      * <ul>
539      * <li><b>&lt;</b> with &amp;lt;
540      * <li><b>&gt;</b> with &amp;gt;
541      * <li><b>&amp;</b> with &amp;amp;
542      * <li><b>&quot;</b> with &amp;quot;
543      * </ul>
544      *
545      * @param source the string to escape
546      * @param doubleEscape if <code>false</code>, all entities that already are escaped are left untouched
547      *
548      * @return the escaped string
549      *
550      * @see #escapeHtml(String)
551      */

552     public static String JavaDoc escapeXml(String JavaDoc source, boolean doubleEscape) {
553
554         if (source == null) {
555             return null;
556         }
557         StringBuffer JavaDoc result = new StringBuffer JavaDoc(source.length() * 2);
558
559         for (int i = 0; i < source.length(); ++i) {
560             char ch = source.charAt(i);
561             switch (ch) {
562                 case '<':
563                     result.append("&lt;");
564                     break;
565                 case '>':
566                     result.append("&gt;");
567                     break;
568                 case '&':
569                     // don't escape already escaped international and special characters
570
if (!doubleEscape) {
571                         int terminatorIndex = source.indexOf(";", i);
572                         if (terminatorIndex > 0) {
573                             if (source.substring(i + 1, terminatorIndex).matches("#[0-9]+")) {
574                                 result.append(ch);
575                                 break;
576                             }
577                         }
578                     }
579                     // note that to other "break" in the above "if" block
580
result.append("&amp;");
581                     break;
582                 case '"':
583                     result.append("&quot;");
584                     break;
585                 default:
586                     result.append(ch);
587             }
588         }
589         return new String JavaDoc(result);
590     }
591
592     /**
593      * Checks if a given encoding name is actually supported, and if so
594      * resolves it to it's canonical name, if not it returns the given fallback
595      * value.<p>
596      *
597      * Charsets have a set of aliases. For example, valid aliases for "UTF-8"
598      * are "UTF8", "utf-8" or "utf8". This method resolves any given valid charset name
599      * to it's "canonical" form, so that simple String comparison can be used
600      * when checking charset names internally later.<p>
601      *
602      * Please see <a HREF="http://www.iana.org/assignments/character-sets">http://www.iana.org/assignments/character-sets</a>
603      * for a list of valid charset alias names.<p>
604      *
605      * @param encoding the encoding to check and resolve
606      * @param fallback the fallback encoding scheme
607      * @return the resolved encoding name, or the fallback value
608      */

609     public static String JavaDoc lookupEncoding(String JavaDoc encoding, String JavaDoc fallback) {
610
611         String JavaDoc result = (String JavaDoc)m_encodingCache.get(encoding);
612         if (result != null) {
613             return result;
614         }
615
616         try {
617             result = Charset.forName(encoding).name();
618             m_encodingCache.put(encoding, result);
619             return result;
620         } catch (Throwable JavaDoc t) {
621             // we will use the default value as fallback
622
}
623
624         return fallback;
625     }
626
627     /**
628      * Re-decodes a String that has not been correctly decoded and thus has scrambled
629      * character bytes.<p>
630      *
631      * This is an equivalent to the JavaScript "decodeURIComponent" function.
632      * It converts from the default "UTF-8" to the currently selected system encoding.<p>
633      *
634      * @param input the String to convert
635      * @return String the converted String
636      */

637     public static String JavaDoc redecodeUriComponent(String JavaDoc input) {
638
639         if (input == null) {
640             return input;
641         }
642         return new String JavaDoc(
643             changeEncoding(input.getBytes(), ENCODING_UTF_8, OpenCms.getSystemInfo().getDefaultEncoding()));
644     }
645
646     /**
647      * Decodes a String in a way that is compatible with the JavaScript
648      * unescape function.
649      *
650      * @param source The String to be decoded.
651      * @param encoding the encoding type
652      * @return The JavaScript unescaped String.
653      */

654     public static String JavaDoc unescape(String JavaDoc source, String JavaDoc encoding) {
655
656         if (source == null) {
657             return null;
658         }
659         int len = source.length();
660         // to use standard decoder we need to replace '+' with "%20" (space)
661
StringBuffer JavaDoc preparedSource = new StringBuffer JavaDoc(len);
662         for (int i = 0; i < len; i++) {
663             char c = source.charAt(i);
664             if (c == '+') {
665                 preparedSource.append("%20");
666             } else {
667                 preparedSource.append(c);
668             }
669         }
670         return decode(preparedSource.toString(), encoding);
671     }
672 }
Popular Tags