KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > java > net > URLEncoder


/*
 * @(#)URLEncoder.java 1.31 06/12/19
 *
 * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
 * SUN PROPRIETARY/CONFIDENTIAL. Use is subject to license terms.
 */


package java.net;

import java.io.ByteArrayOutputStream;
import java.io.BufferedWriter;
import java.io.OutputStreamWriter;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.io.CharArrayWriter;
import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.UnsupportedCharsetException;
import java.util.BitSet;
import java.security.AccessController;
import java.security.PrivilegedAction;
import sun.security.action.GetBooleanAction;
import sun.security.action.GetPropertyAction;

/**
 * Utility class for HTML form encoding. This class contains static methods
 * for converting a String to the <CODE>application/x-www-form-urlencoded</CODE> MIME
 * format. For more information about HTML form encoding, consult the HTML
 * <A HREF="http://www.w3.org/TR/html4/">specification</A>.
 *
 * <p>
 * When encoding a String, the following rules apply:
 *
 * <p>
 * <ul>
 * <li>The alphanumeric characters &quot;<code>a</code>&quot; through
 *     &quot;<code>z</code>&quot;, &quot;<code>A</code>&quot; through
 *     &quot;<code>Z</code>&quot; and &quot;<code>0</code>&quot;
 *     through &quot;<code>9</code>&quot; remain the same.
 * <li>The special characters &quot;<code>.</code>&quot;,
 *     &quot;<code>-</code>&quot;, &quot;<code>*</code>&quot;, and
 *     &quot;<code>_</code>&quot; remain the same.
 * <li>The space character &quot;<code>&nbsp;</code>&quot; is
 *     converted into a plus sign &quot;<code>+</code>&quot;.
 * <li>All other characters are unsafe and are first converted into
 *     one or more bytes using some encoding scheme. Then each byte is
 *     represented by the 3-character string
 *     &quot;<code>%<i>xy</i></code>&quot;, where <i>xy</i> is the
 *     two-digit hexadecimal representation of the byte.
 *     The recommended encoding scheme to use is UTF-8. However,
 *     for compatibility reasons, if an encoding is not specified,
 *     then the default encoding of the platform is used.
 * </ul>
 *
 * <p>
 * For example using UTF-8 as the encoding scheme the string &quot;The
 * string &#252;@foo-bar&quot; would get converted to
 * &quot;The+string+%C3%BC%40foo-bar&quot; because in UTF-8 the character
 * &#252; is encoded as two bytes C3 (hex) and BC (hex), and the
 * character @ is encoded as one byte 40 (hex).
 *
 * @author  Herb Jellinek
 * @version 1.31, 12/19/06
 * @since   JDK1.0
 */
public class URLEncoder {

    /**
     * Set of character values that are copied to the output without being
     * percent-escaped. The space character is a member only so that it is
     * routed through the "safe" branch of the encoder, where it is
     * rewritten to <code>'+'</code>.
     */
    static BitSet dontNeedEncoding;

    /** Distance between a lower-case ASCII letter and its upper-case form. */
    static final int caseDiff = ('a' - 'A');

    /**
     * Value of the <code>file.encoding</code> system property as read during
     * class initialization; consulted by {@link #encode(String)}.
     */
    static String dfltEncName = null;

    static {

        /* The list of characters that are not encoded has been
         * determined as follows:
         *
         * RFC 2396 states:
         * -----
         * Data characters that are allowed in a URI but do not have a
         * reserved purpose are called unreserved.  These include upper
         * and lower case letters, decimal digits, and a limited set of
         * punctuation marks and symbols.
         *
         * unreserved  = alphanum | mark
         *
         * mark        = "-" | "_" | "." | "!" | "~" | "*" | "'" | "(" | ")"
         *
         * Unreserved characters can be escaped without changing the
         * semantics of the URI, but this should not be done unless the
         * URI is being used in a context that does not allow the
         * unescaped character to appear.
         * -----
         *
         * It appears that both Netscape and Internet Explorer escape
         * all special characters from this list with the exception
         * of "-", "_", ".", "*". While it is not clear why they are
         * escaping the other characters, perhaps it is safest to
         * assume that there might be contexts in which the others
         * are unsafe if not escaped. Therefore, we will use the same
         * list. It is also noteworthy that this is consistent with
         * O'Reilly's "HTML: The Definitive Guide" (page 164).
         *
         * As a last note, Internet Explorer does not encode the "@"
         * character which is clearly not unreserved according to the
         * RFC. We are being consistent with the RFC in this matter,
         * as is Netscape.
         */

        dontNeedEncoding = new BitSet(256);
        int i;
        for (i = 'a'; i <= 'z'; i++) {
            dontNeedEncoding.set(i);
        }
        for (i = 'A'; i <= 'Z'; i++) {
            dontNeedEncoding.set(i);
        }
        for (i = '0'; i <= '9'; i++) {
            dontNeedEncoding.set(i);
        }
        // Space is "safe" only in the sense that it is not percent-escaped;
        // the encoder turns it into '+'.
        dontNeedEncoding.set(' ');
        dontNeedEncoding.set('-');
        dontNeedEncoding.set('_');
        dontNeedEncoding.set('.');
        dontNeedEncoding.set('*');

        // Read the property inside doPrivileged, exactly as before, but
        // through a plain PrivilegedAction instead of a JDK-internal helper.
        dfltEncName = AccessController.doPrivileged(
            new PrivilegedAction<String>() {
                @Override
                public String run() {
                    return System.getProperty("file.encoding");
                }
            });
    }

    /**
     * You can't call the constructor.
     */
    private URLEncoder() { }

    /**
     * Translates a string into <code>x-www-form-urlencoded</code>
     * format. This method uses the platform's default encoding
     * as the encoding scheme to obtain the bytes for unsafe characters.
     * <p>
     * The encoding is the one named by {@link #dfltEncName}. If that name is
     * <code>null</code>, illegal, or not supported by this JVM, the charset
     * returned by {@link Charset#defaultCharset()} is used instead, so this
     * method never returns <code>null</code> for a non-null argument.
     *
     * @param   s   <code>String</code> to be translated.
     * @deprecated The resulting string may vary depending on the platform's
     *             default encoding. Instead, use the encode(String,String)
     *             method to specify the encoding.
     * @return  the translated <code>String</code>.
     */
    @Deprecated
    public static String encode(String s) {
        return encodeWith(s, platformCharset());
    }

    /**
     * Translates a string into <code>application/x-www-form-urlencoded</code>
     * format using a specific encoding scheme. This method uses the
     * supplied encoding scheme to obtain the bytes for unsafe
     * characters.
     * <p>
     * <em><strong>Note:</strong> The <a href=
     * "http://www.w3.org/TR/html40/appendix/notes.html#non-ascii-chars">
     * World Wide Web Consortium Recommendation</a> states that
     * UTF-8 should be used. Not doing so may introduce
     * incompatibilities.</em>
     *
     * @param   s   <code>String</code> to be translated.
     * @param   enc   The name of a supported
     *    <a href="../lang/package-summary.html#charenc">character
     *    encoding</a>.
     * @return  the translated <code>String</code>; the very same instance
     *          as <code>s</code> when nothing had to be changed.
     * @exception  UnsupportedEncodingException
     *             If the named encoding is illegal or not supported
     * @exception  NullPointerException
     *             If <code>s</code> or <code>enc</code> is <code>null</code>
     * @see URLDecoder#decode(java.lang.String, java.lang.String)
     * @since 1.4
     */
    public static String encode(String s, String enc)
        throws UnsupportedEncodingException {

        // A null string is reported before the encoding is examined,
        // matching the historical order of checks.
        if (s == null) {
            throw new NullPointerException("s");
        }
        if (enc == null) {
            throw new NullPointerException("charsetName");
        }

        Charset charset;
        try {
            charset = Charset.forName(enc);
        } catch (IllegalCharsetNameException e) {
            throw unsupported(enc, e);
        } catch (UnsupportedCharsetException e) {
            throw unsupported(enc, e);
        }
        return encodeWith(s, charset);
    }

    /**
     * Resolves the charset used by {@link #encode(String)}: the one named by
     * {@link #dfltEncName} when that can be looked up, otherwise the JVM's
     * default charset.
     */
    private static Charset platformCharset() {
        try {
            return Charset.forName(dfltEncName);
        } catch (IllegalArgumentException e) {
            // Raised for a null, syntactically illegal, or unsupported name.
            return Charset.defaultCharset();
        }
    }

    /**
     * Builds the checked exception for an unusable encoding name, keeping the
     * original lookup failure as its cause.
     */
    private static UnsupportedEncodingException unsupported(
            String enc, IllegalArgumentException cause) {
        UnsupportedEncodingException e = new UnsupportedEncodingException(enc);
        e.initCause(cause);
        return e;
    }

    /**
     * Core encoder: copies safe characters, rewrites spaces to '+', and
     * percent-escapes the bytes of every maximal run of unsafe characters.
     */
    private static String encodeWith(String s, Charset charset) {
        final int length = s.length();
        final StringBuilder out = new StringBuilder(length);
        boolean changed = false;

        int i = 0;
        while (i < length) {
            char c = s.charAt(i);
            if (dontNeedEncoding.get(c)) {
                if (c == ' ') {
                    c = '+';
                    changed = true;
                }
                out.append(c);
                i++;
                continue;
            }

            // Gather the whole run of unsafe characters and convert it in one
            // go. Surrogates are never in the safe set, so a surrogate pair
            // always lands inside a single run and is handed to the charset
            // encoder intact; an unpaired surrogate is treated like any
            // other unsafe character.
            final int runStart = i;
            do {
                i++;
            } while (i < length && !dontNeedEncoding.get(s.charAt(i)));

            byte[] encoded = s.substring(runStart, i).getBytes(charset);
            for (byte b : encoded) {
                appendEscaped(out, b);
            }
            changed = true;
        }

        return changed ? out.toString() : s;
    }

    /** Appends <code>%XY</code>, the upper-case hex escape of one byte. */
    private static void appendEscaped(StringBuilder out, byte b) {
        out.append('%');
        out.append(upperHexDigit((b >> 4) & 0xF));
        out.append(upperHexDigit(b & 0xF));
    }

    /** Returns the upper-case hexadecimal digit for a value in 0..15. */
    private static char upperHexDigit(int nibble) {
        char ch = Character.forDigit(nibble, 16);
        // forDigit yields lower-case letters; shift them to upper case.
        if (Character.isLetter(ch)) {
            ch -= caseDiff;
        }
        return ch;
    }
}
277
Popular Tags