KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > com > maverick > util > URLUTF8Encoder


/*
 * SSL-Explorer
 *
 * Copyright (C) 2003-2006 3SP LTD. All Rights Reserved
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2 of
 * the License, or (at your option) any later version.
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

19             
20 package com.maverick.util;
21
/**
 * Provides methods to encode any string into a URL-safe form and to decode
 * it again. Non-ASCII characters are first converted to their UTF-8 byte
 * sequence (two, three or four bytes) and each byte is then written as a
 * %HH escape.
 */
public class URLUTF8Encoder {

    /**
     * Escape lookup table: <code>hex[b]</code> is the three character escape
     * "%HH" (upper-case hexadecimal) for the byte value <code>b</code>,
     * 0 &lt;= b &lt;= 255.
     */
    static final String[] hex = buildHexTable();

    /**
     * Builds the 256 entry "%00" .. "%FF" escape table.
     */
    private static String[] buildHexTable() {
        String digits = "0123456789ABCDEF"; //$NON-NLS-1$
        String[] table = new String[256];
        for (int i = 0; i < table.length; i++) {
            char[] escape = { '%', digits.charAt(i >> 4), digits.charAt(i & 0x0F) };
            // Interned so the entries behave exactly like string literals.
            table[i] = new String(escape).intern();
        }
        return table;
    }

    /**
     * Encode a string into a URL-safe form using UTF-8 and %HH escapes. This
     * is what happens:
     *
     * <ul>
     * <li>
     * <p>
     * The ASCII characters 'a' through 'z', 'A' through 'Z', and '0' through
     * '9' remain the same.
     *
     * <li>
     * <p>
     * The unreserved characters - _ . ! ~ * ' ( ) remain the same.
     *
     * <li>
     * <p>
     * The path separator '/' remains the same unless
     * <code>encodePathSeperator</code> is <code>true</code>, in which case
     * it is converted into "%2F".
     *
     * <li>
     * <p>
     * The space character ' ' is converted into "%20" (not into a plus
     * sign).
     *
     * <li>
     * <p>
     * All other ASCII characters are converted into the 3-character string
     * "%xy", where xy is the two-digit hexadecimal representation of the
     * character code
     *
     * <li>
     * <p>
     * All non-ASCII characters are encoded in two steps: first to a sequence of
     * 2, 3 or 4 bytes, using the UTF-8 algorithm; secondly each of these
     * bytes is encoded as "%xx". A valid surrogate pair is encoded as one
     * 4 byte sequence; a surrogate without its partner is encoded on its own
     * as a 3 byte sequence.
     * </ul>
     *
     * @param s The string to be encoded
     * @param encodePathSeperator <code>true</code> to escape '/' as "%2F",
     *            <code>false</code> to copy it unchanged
     * @return The encoded string
     * @throws NullPointerException if <code>s</code> is <code>null</code>
     */
    public static String encode(String s, boolean encodePathSeperator) {
        int len = s.length();
        StringBuffer sbuf = new StringBuffer(len + 16);
        for (int i = 0; i < len; i++) {
            int ch = s.charAt(i);

            if (ch == '/' && !encodePathSeperator) {
                sbuf.append('/');
            } else if (isUnreserved(ch)) {
                sbuf.append((char) ch);
            } else if (ch <= 0x007F) { // other ASCII, including space -> %20
                sbuf.append(hex[ch]);
            } else if (ch <= 0x07FF) { // two byte sequence
                sbuf.append(hex[0xC0 | (ch >> 6)]);
                sbuf.append(hex[0x80 | (ch & 0x3F)]);
            } else if (isHighSurrogate(ch) && i + 1 < len && isLowSurrogate(s.charAt(i + 1))) {
                // A surrogate pair is one code point above U+FFFF and must
                // be written as a single four byte sequence.
                int low = s.charAt(++i);
                int cp = 0x10000 + ((ch - 0xD800) << 10) + (low - 0xDC00);
                sbuf.append(hex[0xF0 | (cp >> 18)]);
                sbuf.append(hex[0x80 | ((cp >> 12) & 0x3F)]);
                sbuf.append(hex[0x80 | ((cp >> 6) & 0x3F)]);
                sbuf.append(hex[0x80 | (cp & 0x3F)]);
            } else { // three byte sequence (also used for a lone surrogate)
                sbuf.append(hex[0xE0 | (ch >> 12)]);
                sbuf.append(hex[0x80 | ((ch >> 6) & 0x3F)]);
                sbuf.append(hex[0x80 | (ch & 0x3F)]);
            }
        }
        return sbuf.toString();
    }

    /**
     * Decode a string produced by {@link #encode(String, boolean)} (or any
     * other UTF-8 based URL encoder).
     *
     * <ul>
     * <li>
     * <p>
     * "%HH" escapes are converted to bytes and the resulting byte sequence
     * is decoded as UTF-8. Sequences of four bytes yield a surrogate pair;
     * the obsolete five and six byte forms, which lie outside the Unicode
     * range, yield U+FFFD.
     *
     * <li>
     * <p>
     * The plus sign '+' is converted into a space.
     *
     * <li>
     * <p>
     * A '%' that is not followed by two ASCII hexadecimal digits is copied
     * unchanged.
     *
     * <li>
     * <p>
     * Unescaped characters up to 0xFF are treated as byte values; unescaped
     * characters above 0xFF cannot be bytes and are copied unchanged.
     * </ul>
     *
     * Apart from the cases above the UTF-8 input is not checked for being
     * well-formed.
     *
     * @param s The string to be decoded
     * @return The decoded string
     * @throws NullPointerException if <code>s</code> is <code>null</code>
     */
    public static String decode(String s) {
        int len = s.length();
        StringBuffer sbuf = new StringBuffer(len);
        int sumb = 0; // bits collected for the character in progress
        int more = -1; // continuation bytes still expected
        for (int i = 0; i < len; i++) {
            int ch = s.charAt(i);
            int b;

            /* Get next byte b from URL segment s */
            if (ch == '%') {
                boolean complete = i + 2 < len;
                int hb = complete ? hexValue(s.charAt(i + 1)) : -1;
                int lb = complete ? hexValue(s.charAt(i + 2)) : -1;
                if (hb < 0 || lb < 0) {
                    b = '%'; // malformed escape: keep the percent sign
                } else {
                    b = (hb << 4) | lb;
                    i += 2;
                }
            } else if (ch == '+') {
                b = ' ';
            } else if (ch > 0xFF) {
                // Not a byte value, so it must not enter the UTF-8 decoder.
                sbuf.append((char) ch);
                more = -1; // abandon any unfinished sequence
                continue;
            } else {
                b = ch;
            }

            /* Decode byte b as UTF-8, sumb collects incomplete chars */
            if ((b & 0xC0) == 0x80) { // 10xxxxxx (continuation byte)
                sumb = (sumb << 6) | (b & 0x3F);
                if (--more == 0) {
                    appendCodePoint(sbuf, sumb);
                }
            } else if ((b & 0x80) == 0x00) { // 0xxxxxxx (yields 7 bits)
                sbuf.append((char) b);
            } else if ((b & 0xE0) == 0xC0) { // 110xxxxx (yields 5 bits)
                sumb = b & 0x1F;
                more = 1;
            } else if ((b & 0xF0) == 0xE0) { // 1110xxxx (yields 4 bits)
                sumb = b & 0x0F;
                more = 2;
            } else if ((b & 0xF8) == 0xF0) { // 11110xxx (yields 3 bits)
                sumb = b & 0x07;
                more = 3;
            } else if ((b & 0xFC) == 0xF8) { // 111110xx (yields 2 bits)
                sumb = b & 0x03;
                more = 4;
            } else { // 1111110x (yields 1 bit)
                sumb = b & 0x01;
                more = 5;
            }
        }
        return sbuf.toString();
    }

    /**
     * Tells whether a character is copied unchanged by the encoder: ASCII
     * letters, ASCII digits and the marks - _ . ! ~ * ' ( ).
     */
    private static boolean isUnreserved(int ch) {
        if (('A' <= ch && ch <= 'Z') || ('a' <= ch && ch <= 'z') || ('0' <= ch && ch <= '9')) {
            return true;
        }
        switch (ch) {
            case '-':
            case '_':
            case '.':
            case '!':
            case '~':
            case '*':
            case '\'':
            case '(':
            case ')':
                return true;
            default:
                return false;
        }
    }

    /**
     * Tells whether a UTF-16 unit is a high (leading) surrogate.
     */
    private static boolean isHighSurrogate(int ch) {
        return 0xD800 <= ch && ch <= 0xDBFF;
    }

    /**
     * Tells whether a UTF-16 unit is a low (trailing) surrogate.
     */
    private static boolean isLowSurrogate(int ch) {
        return 0xDC00 <= ch && ch <= 0xDFFF;
    }

    /**
     * Returns the value of an ASCII hexadecimal digit, or -1 if the
     * character is not one.
     */
    private static int hexValue(int ch) {
        if ('0' <= ch && ch <= '9') {
            return ch - '0';
        }
        if ('a' <= ch && ch <= 'f') {
            return ch - 'a' + 10;
        }
        if ('A' <= ch && ch <= 'F') {
            return ch - 'A' + 10;
        }
        return -1;
    }

    /**
     * Appends a decoded code point: directly when it fits in one char, as a
     * surrogate pair up to U+10FFFF, and as U+FFFD when it is out of range.
     */
    private static void appendCodePoint(StringBuffer sbuf, int cp) {
        if (cp >= 0 && cp < 0x10000) {
            sbuf.append((char) cp);
        } else if (cp >= 0x10000 && cp <= 0x10FFFF) {
            int offset = cp - 0x10000;
            sbuf.append((char) (0xD800 + (offset >> 10)));
            sbuf.append((char) (0xDC00 + (offset & 0x3FF)));
        } else {
            sbuf.append('\uFFFD');
        }
    }
}
Popular Tags