CmsEncoder


1   /*
2    * File   : $Source: /usr/local/cvs/opencms/src/org/opencms/i18n/CmsEncoder.java,v $
3    * Date   : $Date: 2006/07/20 13:46:39 $
4    * Version: $Revision: 1.20 $
5    *
6    * This library is part of OpenCms -
7    * the Open Source Content Mananagement System
8    *
9    * Copyright (c) 2005 Alkacon Software GmbH (http://www.alkacon.com)
10   *
11   * This library is free software; you can redistribute it and/or
12   * modify it under the terms of the GNU Lesser General Public
13   * License as published by the Free Software Foundation; either
14   * version 2.1 of the License, or (at your option) any later version.
15   *
16   * This library is distributed in the hope that it will be useful,
17   * but WITHOUT ANY WARRANTY; without even the implied warranty of
18   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19   * Lesser General Public License for more details.
20   *
21   * For further information about Alkacon Software GmbH, please see the
22   * company website: http://www.alkacon.com
23   *
24   * For further information about OpenCms, please see the
25   * project website: http://www.opencms.org
26   * 
27   * You should have received a copy of the GNU Lesser General Public
28   * License along with this library; if not, write to the Free Software
29   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
30   */
31  
32  package org.opencms.i18n;
33  
34  import org.opencms.main.CmsLog;
35  import org.opencms.main.OpenCms;
36  import org.opencms.util.CmsStringUtil;
37  
38  import java.io.UnsupportedEncodingException  ;
39  import java.net.URLDecoder  ;
40  import java.net.URLEncoder  ;
41  import java.nio.CharBuffer  ;
42  import java.nio.charset.Charset  ;
43  import java.nio.charset.CharsetEncoder  ;
44  import java.util.HashMap  ;
45  import java.util.Map  ;
46  import java.util.regex.Matcher  ;
47  import java.util.regex.Pattern  ;
48  
49  import org.apache.commons.logging.Log;
50  
51  /**
52   * The OpenCms CmsEncoder class provides static methods to decode and encode data.<p>
53   * 
54   * The methods in this class are substitutes for <code>java.net.URLEncoder.encode()</code> and
55   * <code>java.net.URLDecoder.decode()</code>. Use the methods from this class in all OpenCms 
56   * core classes to ensure the encoding is always handled the same way.<p>
57   * 
58   * The de- and encoding uses the same coding mechanism as JavaScript: special characters are
59   * replaced with <code>%hex</code>, where hex is a two digit hex number.<p>
60   * 
61   * <b>Note:</b> On the client side (browser), instead of using the corresponding <code>escape</code>
62   * and <code>unescape</code> JavaScript functions, better use the <code>encodeURIComponent</code> and
63   * <code>decodeURIComponent</code> functions, which work properly with unicode characters.
64   * These functions are supported in IE 5.5+ and NS 6+ only.<p>
65   *
66   * @author Alexander Kandzior 
67   * 
68   * @version $Revision: 1.20 $ 
69   * 
70   * @since 6.0.0 
71   */
72  public final class CmsEncoder {
73  
74      /** Constant for the standard <code>ISO-8859-1</code> encoding. */
75      public static final String   ENCODING_ISO_8859_1 = "ISO-8859-1";
76  
77      /** Constant for the standard <code>US-ASCII</code> encoding. */
78      public static final String   ENCODING_US_ASCII = "US-ASCII";
79  
80      /** 
81       * Constant for the standard <code>UTF-8</code> encoding.<p>
82       * 
83       * Default encoding for JavaScript decodeUriComponent methods is <code>UTF-8</code> by w3c standard. 
84       */
85      public static final String   ENCODING_UTF_8 = "UTF-8";
86  
87      /** The regex pattern to match HTML entities. */
88      private static final Pattern   ENTITIY_PATTERN = Pattern.compile("\\&#\\d+;");
89  
90      /** The prefix for HTML entities. */
91      private static final String   ENTITY_PREFIX = "&#";
92  
93      /** The replacement for HTML entity prefix in parameters. */
94      private static final String   ENTITY_REPLACEMENT = "$$";
95  
96      /** The log object for this class. */
97      private static final Log LOG = CmsLog.getLog(CmsEncoder.class);
98  
99      /** A cache for encoding name lookup. */
100     private static Map   m_encodingCache = new HashMap  (16);
101 
102     /** The plus entity. */
103     private static final String   PLUS_ENTITY = ENTITY_PREFIX + "043;";
104 
105     /**
106      * Constructor.<p>
107      */
108     private CmsEncoder() {
109 
110         // empty
111     }
112 
113     /**
114      * Adjusts the given String by making sure all characters that can be displayed 
115      * in the given charset are contained as chars, whereas all other non-displayable
116      * characters are converted to HTML entities.<p> 
117      * 
118      * Just calls {@link #decodeHtmlEntities(String, String)} first and feeds the result
119      * to {@link #encodeHtmlEntities(String, String)}. <p>
120      *  
121      * @param input the input to adjust the HTML encoding for
122      * @param encoding the charset to encode the result with
123      * @return the input with the decoded/encoded HTML entities
124      */
125     public static String   adjustHtmlEncoding(String   input, String   encoding) {
126 
127         return encodeHtmlEntities(decodeHtmlEntities(input, encoding), encoding);
128     }
129 
130     /**
131      * Changes the encoding of a byte array that represents a String.<p>
132      * 
133      * @param input the byte array to convert
134      * @param oldEncoding the current encoding of the byte array
135      * @param newEncoding the new encoding of the byte array
136      * @return byte[] the byte array encoded in the new encoding
137      */
138     public static byte[] changeEncoding(byte[] input, String   oldEncoding, String   newEncoding) {
139 
140         if ((oldEncoding == null) || (newEncoding == null)) {
141             return input;
142         }
143         if (oldEncoding.trim().equalsIgnoreCase(newEncoding.trim())) {
144             return input;
145         }
146         byte[] result = input;
147         try {
148             result = (new String  (input, oldEncoding)).getBytes(newEncoding);
149         } catch (UnsupportedEncodingException   e) {
150             // return value will be input value
151         }
152         return result;
153     }
154 
155     /**
156      * Creates a String out of a byte array with the specified encoding, falling back
157      * to the system default in case the encoding name is not valid.<p>
158      * 
159      * Use this method as a replacement for <code>new String(byte[], encoding)</code>
160      * to avoid possible encoding problems.<p>
161      * 
162      * @param bytes the bytes to decode 
163      * @param encoding the encoding scheme to use for decoding the bytes
164      * @return the bytes decoded to a String
165      */
166     public static String   createString(byte[] bytes, String   encoding) {
167 
168         if (encoding.intern() != OpenCms.getSystemInfo().getDefaultEncoding()) {
169             encoding = lookupEncoding(encoding, null);
170         }
171         if (encoding != null) {
172             try {
173                 return new String  (bytes, encoding);
174             } catch (UnsupportedEncodingException   e) {
175                 // this can _never_ happen since the charset was looked up first 
176             }
177         } else {
178             if (LOG.isWarnEnabled()) {
179                 LOG.warn(Messages.get().getBundle().key(Messages.ERR_UNSUPPORTED_VM_ENCODING_1, encoding));
180             }
181             encoding = OpenCms.getSystemInfo().getDefaultEncoding();
182             try {
183                 return new String  (bytes, encoding);
184             } catch (UnsupportedEncodingException   e) {
185                 // this can also _never_ happen since the default encoding is always valid
186             }
187         }
188         // this code is unreachable in pratice
189         LOG.error(Messages.get().getBundle().key(Messages.ERR_ENCODING_ISSUES_1, encoding));
190         return null;
191     }
192 
193     /**
194      * Decodes a String using UTF-8 encoding, which is the standard for http data transmission
195      * with GET ant POST requests.<p>
196      * 
197      * @param source the String to decode
198      * @return String the decoded source String
199      */
200     public static String   decode(String   source) {
201 
202         return decode(source, ENCODING_UTF_8);
203     }
204 
205     /**
206      * This method is a substitute for <code>URLDecoder.decode()</code>.
207      * Use this in all OpenCms core classes to ensure the encoding is
208      * always handled the same way.<p>
209      * 
210      * In case you don't know what encoding to use, set the value of 
211      * the <code>encoding</code> parameter to <code>null</code>. 
212      * This method will then default to UTF-8 encoding, which is propably the right one.<p>
213      * 
214      * @param source The string to decode
215      * @param encoding The encoding to use (if null, the system default is used)
216      * @return The decoded source String
217      */
218     public static String   decode(String   source, String   encoding) {
219 
220         if (source == null) {
221             return null;
222         }
223         if (encoding != null) {
224             try {
225                 return URLDecoder.decode(source, encoding);
226             } catch (java.io.UnsupportedEncodingException   e) {
227                 // will fallback to default
228             }
229         }
230         // fallback to default decoding
231         try {
232             return URLDecoder.decode(source, ENCODING_UTF_8);
233         } catch (java.io.UnsupportedEncodingException   e) {
234             // ignore
235         }
236         return source;
237     }
238 
239     /**
240      * Decodes HTML entity references like <code>&amp;#8364;</code> that are contained in the 
241      * String to a regulat character, but only if that character is contained in the given 
242      * encodings charset.<p> 
243      * 
244      * @param input the input to decode the HTML enties in
245      * @param encoding the charset to decode the input for
246      * @return the input with the decoded HTML entities
247      * @see #encodeHtmlEntities(String, String)
248      */
249     public static String   decodeHtmlEntities(String   input, String   encoding) {
250 
251         Matcher   matcher = ENTITIY_PATTERN.matcher(input);
252         StringBuffer   result = new StringBuffer  (input.length());
253         Charset   charset = Charset.forName(encoding);
254         CharsetEncoder   encoder = charset.newEncoder();
255 
256         while (matcher.find()) {
257             String   entity = matcher.group();
258             String   value = entity.substring(2, entity.length() - 1);
259             int c = Integer.valueOf(value).intValue();
260             if (c < 128) {
261                 // first 128 chars are contained in almost every charset
262                 entity = new String  (new char[] {(char)c});
263                 // this is intendend as performance improvement since 
264                 // the canEncode() operation appears quite CPU heavy
265             } else if (encoder.canEncode((char)c)) {
266                 // encoder can endoce this char
267                 entity = new String  (new char[] {(char)c});
268             }
269             matcher.appendReplacement(result, entity);
270         }
271         matcher.appendTail(result);
272         return result.toString();
273     }
274 
275     /**
276      * Decodes a string used as parameter in an uri in a way independent of other encodings/decodings applied before.<p>
277      * 
278      * @param input the encoded parameter string
279      * @return the decoded parameter string
280      * @see #encodeParameter(String)
281      */
282     public static String   decodeParameter(String   input) {
283 
284         String   result = CmsStringUtil.substitute(input, ENTITY_REPLACEMENT, ENTITY_PREFIX);
285         return CmsEncoder.decodeHtmlEntities(result, OpenCms.getSystemInfo().getDefaultEncoding());
286     }
287 
288     /**
289      * Encodes a String using UTF-8 encoding, which is the standard for http data transmission
290      * with GET ant POST requests.<p>
291      * 
292      * @param source the String to encode
293      * @return String the encoded source String
294      */
295     public static String   encode(String   source) {
296 
297         return encode(source, ENCODING_UTF_8);
298     }
299 
300     /**
301      * This method is a substitute for <code>URLEncoder.encode()</code>.
302      * Use this in all OpenCms core classes to ensure the encoding is
303      * always handled the same way.<p>
304      * 
305      * In case you don't know what encoding to use, set the value of 
306      * the <code>encoding</code> parameter to <code>null</code>. 
307      * This method will then default to UTF-8 encoding, which is propably the right one.<p>
308      * 
309      * @param source the String to encode
310      * @param encoding the encoding to use (if null, the system default is used)
311      * @return the encoded source String
312      */
313     public static String   encode(String   source, String   encoding) {
314 
315         if (source == null) {
316             return null;
317         }
318         if (encoding != null) {
319             try {
320                 return URLEncoder.encode(source, encoding);
321             } catch (java.io.UnsupportedEncodingException   e) {
322                 // will fallback to default
323             }
324         }
325         // fallback to default encoding
326         try {
327             return URLEncoder.encode(source, ENCODING_UTF_8);
328         } catch (java.io.UnsupportedEncodingException   e) {
329             // ignore
330         }
331         return source;
332     }
333 
334     /**
335      * Encodes all characters that are contained in the String which can not displayed 
336      * in the given encodings charset with HTML entity references
337      * like <code>&amp;#8364;</code>.<p>
338      * 
339      * This is required since a Java String is 
340      * internally always stored as Unicode, meaning it can contain almost every character, but 
341      * the HTML charset used might not support all such characters.<p>
342      * 
343      * @param input the input to encode for HTML
344      * @param encoding the charset to encode the result with
345      * @return the input with the encoded HTML entities
346      * @see #decodeHtmlEntities(String, String)
347      */
348     public static String   encodeHtmlEntities(String   input, String   encoding) {
349 
350         StringBuffer   result = new StringBuffer  (input.length() * 2);
351         CharBuffer   buffer = CharBuffer.wrap(input.toCharArray());
352         Charset   charset = Charset.forName(encoding);
353         CharsetEncoder   encoder = charset.newEncoder();
354         for (int i = 0; i < buffer.length(); i++) {
355             int c = buffer.get(i);
356             if (c < 128) {
357                 // first 128 chars are contained in almost every charset
358                 result.append((char)c);
359                 // this is intendend as performance improvement since 
360                 // the canEncode() operation appears quite CPU heavy
361             } else if (encoder.canEncode((char)c)) {
362                 // encoder can endoce this char
363                 result.append((char)c);
364             } else {
365                 // append HTML entiry reference
366                 result.append(ENTITY_PREFIX);
367                 result.append(c);
368                 result.append(";");
369             }
370         }
371         return result.toString();
372     }
373 
374     /**
375      * Encodes a string used as parameter in an uri in a way independent of other encodings/decodings applied later.<p>
376      * 
377      * Used to ensure that GET parameters are not wrecked by wrong or incompatible configuration settings.
378      * In order to ensure this, the String is first encoded with html entities for any character that cannot encoded
379      * in US-ASCII; additionally, the plus sign is also encoded to avoid problems with the white-space replacer.
380      * Finally, the entity prefix is replaced with characters not used as delimiters in urls.<p>
381      * 
382      * @param input the parameter string
383      * @return the encoded parameter string
384      */
385     public static String   encodeParameter(String   input) {
386 
387         String   result = CmsEncoder.encodeHtmlEntities(input, CmsEncoder.ENCODING_US_ASCII);
388         result = CmsStringUtil.substitute(result, "+", PLUS_ENTITY);
389         return CmsStringUtil.substitute(result, ENTITY_PREFIX, ENTITY_REPLACEMENT);
390     }
391 
392     /**
393      * Encodes a String in a way that is compatible with the JavaScript escape function.
394      * 
395      * @param source The textstring to be encoded.
396      * @param encoding the encoding type
397      * @return The JavaScript escaped string.
398      */
399     public static String   escape(String   source, String   encoding) {
400 
401         // the blank is encoded into "+" not "%20" when using standard encode call
402         return CmsStringUtil.substitute(encode(source, encoding), "+", "%20");
403     }
404 
405     /**
406      * Escapes special characters in a HTML-String with their number-based 
407      * entity representation, for example &amp; becomes &amp;#38;.<p>
408      * 
409      * A character <code>num</code> is replaced if<br>
410      * <code>((ch != 32) && ((ch > 122) || (ch < 48) || (ch == 60) || (ch == 62)))</code><p>
411      * 
412      * @param source the String to escape
413      * @return String the escaped String
414      * 
415      * @see #escapeXml(String)
416      */
417     public static String   escapeHtml(String   source) {
418 
419         int terminatorIndex;
420         if (source == null) {
421             return null;
422         }
423         StringBuffer   result = new StringBuffer  (source.length() * 2);
424         for (int i = 0; i < source.length(); i++) {
425             int ch = source.charAt(i);
426             // avoid escaping already escaped characters            
427             if (ch == 38) {
428                 terminatorIndex = source.indexOf(";", i);
429                 if (terminatorIndex > 0) {
430                     if (source.substring(i + 1, terminatorIndex).matches("#[0-9]+|lt|gt|amp|quote")) {
431                         result.append(source.substring(i, terminatorIndex + 1));
432                         // Skip remaining chars up to (and including) ";"
433                         i = terminatorIndex;
434                         continue;
435                     }
436                 }
437             }
438             if ((ch != 32) && ((ch > 122) || (ch < 48) || (ch == 60) || (ch == 62))) {
439                 result.append(ENTITY_PREFIX);
440                 result.append(ch);
441                 result.append(";");
442             } else {
443                 result.append((char)ch);
444             }
445         }
446         return new String  (result);
447     }
448 
449     /**
450      * Escapes non ASCII characters in a HTML-String with their number-based 
451      * entity representation, for example &amp; becomes &amp;#38;.<p>
452      * 
453      * A character <code>num</code> is replaced if<br>
454      * <code>(ch > 255)</code><p>
455      * 
456      * @param source the String to escape
457      * @return String the escaped String
458      * 
459      * @see #escapeXml(String)
460      */
461     public static String   escapeNonAscii(String   source) {
462 
463         if (source == null) {
464             return null;
465         }
466         StringBuffer   result = new StringBuffer  (source.length() * 2);
467         for (int i = 0; i < source.length(); i++) {
468             int ch = source.charAt(i);
469             if (ch > 255) {
470                 result.append(ENTITY_PREFIX);
471                 result.append(ch);
472                 result.append(";");
473             } else {
474                 result.append((char)ch);
475             }
476         }
477         return new String  (result);
478     }
479 
480     /**
481      * Encodes a String in a way that is compatible with the JavaScript escape function.
482      * Muliple blanks are encoded _multiply _with %20.
483      * 
484      * @param source The textstring to be encoded.
485      * @param encoding the encoding type
486      * @return The JavaScript escaped string.
487      */
488     public static String   escapeWBlanks(String   source, String   encoding) {
489 
490         if (CmsStringUtil.isEmpty(source)) {
491             return source;
492         }
493         StringBuffer   ret = new StringBuffer  (source.length() * 2);
494 
495         // URLEncode the text string
496         // this produces a very similar encoding to JavaSscript encoding, 
497         // except the blank which is not encoded into "%20" instead of "+"
498 
499         String   enc = encode(source, encoding);
500         for (int z = 0; z < enc.length(); z++) {
501             char c = enc.charAt(z);
502             if (c == '+') {
503                 ret.append("%20");
504             } else {
505                 ret.append(c);
506             }
507         }
508         return ret.toString();
509     }
510 
511     /**
512      * Escapes a String so it may be printed as text content or attribute
513      * value in a HTML page or an XML file.<p>
514      * 
515      * This method replaces the following characters in a String:
516      * <ul>
517      * <li><b>&lt;</b> with &amp;lt;
518      * <li><b>&gt;</b> with &amp;gt;
519      * <li><b>&amp;</b> with &amp;amp;
520      * <li><b>&quot;</b> with &amp;quot;
521      * </ul>
522      * 
523      * @param source the string to escape
524      * @return the escaped string
525      * 
526      * @see #escapeHtml(String)
527      */
528     public static String   escapeXml(String   source) {
529 
530         return escapeXml(source, false);
531     }
532 
533     /**
534      * Escapes a String so it may be printed as text content or attribute
535      * value in a HTML page or an XML file.<p>
536      * 
537      * This method replaces the following characters in a String:
538      * <ul>
539      * <li><b>&lt;</b> with &amp;lt;
540      * <li><b>&gt;</b> with &amp;gt;
541      * <li><b>&amp;</b> with &amp;amp;
542      * <li><b>&quot;</b> with &amp;quot;
543      * </ul>
544      * 
545      * @param source the string to escape
546      * @param doubleEscape if <code>false</code>, all entities that already are escaped are left untouched
547      * 
548      * @return the escaped string
549      * 
550      * @see #escapeHtml(String)
551      */
552     public static String   escapeXml(String   source, boolean doubleEscape) {
553 
554         if (source == null) {
555             return null;
556         }
557         StringBuffer   result = new StringBuffer  (source.length() * 2);
558 
559         for (int i = 0; i < source.length(); ++i) {
560             char ch = source.charAt(i);
561             switch (ch) {
562                 case '<':
563                     result.append("&lt;");
564                     break;
565                 case '>':
566                     result.append("&gt;");
567                     break;
568                 case '&':
569                     // don't escape already escaped international and special characters
570                     if (!doubleEscape) {
571                         int terminatorIndex = source.indexOf(";", i);
572                         if (terminatorIndex > 0) {
573                             if (source.substring(i + 1, terminatorIndex).matches("#[0-9]+")) {
574                                 result.append(ch);
575                                 break;
576                             }
577                         }
578                     }
579                     // note that to other "break" in the above "if" block
580                     result.append("&amp;");
581                     break;
582                 case '"':
583                     result.append("&quot;");
584                     break;
585                 default:
586                     result.append(ch);
587             }
588         }
589         return new String  (result);
590     }
591 
592     /**
593      * Checks if a given encoding name is actually supported, and if so
594      * resolves it to it's canonical name, if not it returns the given fallback 
595      * value.<p> 
596      * 
597      * Charsets have a set of aliases. For example, valid aliases for "UTF-8"
598      * are "UTF8", "utf-8" or "utf8". This method resolves any given valid charset name 
599      * to it's "canonical" form, so that simple String comparison can be used
600      * when checking charset names internally later.<p>
601      * 
602      * Please see <a HREF="http://www.iana.org/assignments/character-sets">http://www.iana.org/assignments/character-sets</a> 
603      * for a list of valid charset alias names.<p>
604      * 
605      * @param encoding the encoding to check and resolve
606      * @param fallback the fallback encoding scheme
607      * @return the resolved encoding name, or the fallback value
608      */
609     public static String   lookupEncoding(String   encoding, String   fallback) {
610 
611         String   result = (String  )m_encodingCache.get(encoding);
612         if (result != null) {
613             return result;
614         }
615 
616         try {
617             result = Charset.forName(encoding).name();
618             m_encodingCache.put(encoding, result);
619             return result;
620         } catch (Throwable   t) {
621             // we will use the default value as fallback
622         }
623 
624         return fallback;
625     }
626 
627     /**
628      * Re-decodes a String that has not been correctly decoded and thus has scrambled
629      * character bytes.<p>
630      * 
631      * This is an equivalent to the JavaScript "decodeURIComponent" function.
632      * It converts from the default "UTF-8" to the currently selected system encoding.<p>
633      * 
634      * @param input the String to convert
635      * @return String the converted String
636      */
637     public static String   redecodeUriComponent(String   input) {
638 
639         if (input == null) {
640             return input;
641         }
642         return new String  (
643             changeEncoding(input.getBytes(), ENCODING_UTF_8, OpenCms.getSystemInfo().getDefaultEncoding()));
644     }
645 
646     /**
647      * Decodes a String in a way that is compatible with the JavaScript 
648      * unescape function.
649      * 
650      * @param source The String to be decoded.
651      * @param encoding the encoding type
652      * @return The JavaScript unescaped String.
653      */
654     public static String   unescape(String   source, String   encoding) {
655 
656         if (source == null) {
657             return null;
658         }
659         int len = source.length();
660         // to use standard decoder we need to replace '+' with "%20" (space)
661         StringBuffer   preparedSource = new StringBuffer  (len);
662         for (int i = 0; i < len; i++) {
663             char c = source.charAt(i);
664             if (c == '+') {
665                 preparedSource.append("%20");
666             } else {
667                 preparedSource.append(c);
668             }
669         }
670         return decode(preparedSource.toString(), encoding);
671     }
672 }
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags