Java > Open Source Codes > java > net > IDN


1   /*
2    * @(#)IDN.java 1.3 05/11/17
3    *
4    * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
5    * SUN PROPRIETARY/CONFIDENTIAL. Use is subject to license terms.
6    */
7   package java.net;
8   
9   import java.io.InputStream  ;
10  import java.io.IOException  ;
11  import java.security.AccessController  ;
12  import java.security.PrivilegedAction  ;
13  
14  import sun.net.idn.StringPrep;
15  import sun.net.idn.Punycode;
16  import sun.text.normalizer.UCharacterIterator;
17  
18  /**
19   * Provides methods to convert internationalized domain names (IDNs) between
20   * a normal Unicode representation and an ASCII Compatible Encoding (ACE) representation.
21   * Internationalized domain names can use characters from the entire range of
22   * Unicode, while traditional domain names are restricted to ASCII characters.
23   * ACE is an encoding of Unicode strings that uses only ASCII characters and
24   * can be used with software (such as the Domain Name System) that only
25   * understands traditional domain names.
26   *
27   * <p>Internationalized domain names are defined in <a HREF="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a>.
28   * RFC 3490 defines two operations: ToASCII and ToUnicode. These two operations employ
29   * the <a HREF="http://www.ietf.org/rfc/rfc3491.txt">Nameprep</a> algorithm, which is a
30   * profile of <a HREF="http://www.ietf.org/rfc/rfc3454.txt">Stringprep</a>, and the
31   * <a HREF="http://www.ietf.org/rfc/rfc3492.txt">Punycode</a> algorithm to convert
32   * domain name strings back and forth.
33   *
34   * <p>The behavior of aforementioned conversion process can be adjusted by various flags:
35   *   <ul>
36   *     <li>If the ALLOW_UNASSIGNED flag is used, the domain name string to be converted
37   *         can contain code points that are unassigned in Unicode 3.2, which is the
38   *         Unicode version on which IDN conversion is based. If the flag is not used,
39   *         the presence of such unassigned code points is treated as an error.
40   *     <li>If the USE_STD3_ASCII_RULES flag is used, ASCII strings are checked against <a HREF="http://www.ietf.org/rfc/rfc1122.txt">RFC 1122</a> and <a HREF="http://www.ietf.org/rfc/rfc1123.txt">RFC 1123</a>.
41   *         It is an error if they don't meet the requirements.
42   *   </ul>
43   * These flags can be logically OR'ed together.
44   *
45   * <p>Security considerations are important with respect to internationalized
46   * domain name support. For example, English domain names may be <i>homographed</i>
47   * - maliciously misspelled by substitution of non-Latin letters.
48   * <a HREF="http://www.unicode.org/reports/tr36/">Unicode Technical Report #36</a>
49   * discusses security issues of IDN support as well as possible solutions.
50   * Applications are responsible for taking adequate security measures when using
51   * international domain names.
52   *
53   * @version 1.3, 05/11/17
54   * @author Edward Wang
55   * @since 1.6
56   *
57   */
58  public final class IDN {
59      /**
60       * Flag to allow processing of unassigned code points
61       */
62      public static final int ALLOW_UNASSIGNED = 0x01;
63      
64      /**
65       * Flag to turn on the check against STD-3 ASCII rules
66       */
67      public static final int USE_STD3_ASCII_RULES = 0x02;
68      
69      
70      /**
71       * Translates a string from Unicode to ASCII Compatible Encoding (ACE),
72       * as defined by the ToASCII operation of <a HREF="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a>.
73       *
74       * <p>ToASCII operation can fail. ToASCII fails if any step of it fails.
75       * If ToASCII operation fails, an IllegalArgumentException will be thrown.
76       * In this case, the input string should not be used in an internationalized domain name.
77       *
78       * <p> A label is an individual part of a domain name. The original ToASCII operation,
79       * as defined in RFC 3490, only operates on a single label. This method can handle
80       * both label and entire domain name, by assuming that labels in a domain name are
81       * always separated by dots. The following characters are recognized as dots:
82       * &#0092;u002E (full stop), &#0092;u3002 (ideographic full stop), &#0092;uFF0E (fullwidth full stop),
83       * and &#0092;uFF61 (halfwidth ideographic full stop). if dots are
84       * used as label separators, this method also changes all of them to &#0092;u002E (full stop)
85       * in output translated string.
86       *
87       * @param input     the string to be processed
88       * @param flag      process flag; can be 0 or any logical OR of possible flags
89       *
90       * @return          the translated <tt>String</tt>
91       *
92       * @throws IllegalArgumentException   if the input string doesn't conform to RFC 3490 specification
93       */
94      public static String   toASCII(String   input, int flag)
95      {
96          int p = 0, q = 0;
97          StringBuffer   out = new StringBuffer  ();
98          
99          while (p < input.length()) {
100             q = searchDots(input, p);
101             out.append(toASCIIInternal(input.substring(p, q),  flag));
102             p = q + 1;
103             if (p < input.length()) out.append('.');
104         }
105         
106         return out.toString();
107     }
108     
109     
110     /**
111      * Translates a string from Unicode to ASCII Compatible Encoding (ACE),
112      * as defined by the ToASCII operation of <a HREF="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a>.
113      *
114      * <p> This convenience method works as if by invoking the
115      * two-argument counterpart as follows:
116      * <blockquote><tt>
117      * {@link #toASCII(String, int) toASCII}(input,&nbsp;0);
118      * </tt></blockquote>
119      *
120      * @param input     the string to be processed
121      *
122      * @return          the translated <tt>String</tt>
123      *
124      * @throws IllegalArgumentException   if the input string doesn't conform to RFC 3490 specification
125      */
126     public static String   toASCII(String   input) {
127         return toASCII(input, 0);
128     }
129     
130     
131     /**
132      * Translates a string from ASCII Compatible Encoding (ACE) to Unicode,
133      * as defined by the ToUnicode operation of <a HREF="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a>.
134      *
135      * <p>ToUnicode never fails. In case of any error, the input string is returned unmodified.
136      *
137      * <p> A label is an individual part of a domain name. The original ToUnicode operation,
138      * as defined in RFC 3490, only operates on a single label. This method can handle
139      * both label and entire domain name, by assuming that labels in a domain name are
140      * always separated by dots. The following characters are recognized as dots:
141      * &#0092;u002E (full stop), &#0092;u3002 (ideographic full stop), &#0092;uFF0E (fullwidth full stop),
142      * and &#0092;uFF61 (halfwidth ideographic full stop).
143      *
144      * @param input     the string to be processed
145      * @param flag      process flag; can be 0 or any logical OR of possible flags
146      *
147      * @return          the translated <tt>String</tt>
148      */
149     public static String   toUnicode(String   input, int flag) {
150         int p = 0, q = 0;
151         StringBuffer   out = new StringBuffer  ();
152         
153         while (p < input.length()) {
154             q = searchDots(input, p);
155             out.append(toUnicodeInternal(input.substring(p, q),  flag));
156             p = q + 1;
157             if (p < input.length()) out.append('.');
158         }
159         
160         return out.toString();
161     }
162     
163     
164     /**
165      * Translates a string from ASCII Compatible Encoding (ACE) to Unicode,
166      * as defined by the ToUnicode operation of <a HREF="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a>.
167      *
168      * <p> This convenience method works as if by invoking the
169      * two-argument counterpart as follows:
170      * <blockquote><tt>
171      * {@link #toUnicode(String, int) toUnicode}(input,&nbsp;0);
172      * </tt></blockquote>
173      *
174      * @param input     the string to be processed
175      *
176      * @return          the translated <tt>String</tt>
177      */
178     public static String   toUnicode(String   input) {
179         return toUnicode(input, 0);
180     }
181 
182     
183     /* ---------------- Private members -------------- */
184     
185     // ACE Prefix is "xn--"
186     private static final String   ACE_PREFIX = "xn--";
187     private static final int ACE_PREFIX_LENGTH = ACE_PREFIX.length();
188 
189     private static final int MAX_LABEL_LENGTH   = 63;
190 
191     // single instance of nameprep
192     private static StringPrep namePrep = null;
193     
194     static {
195         InputStream   stream = null;
196         
197         try {
198             final String   IDN_PROFILE = "uidna.spp";
199             if (System.getSecurityManager() != null) {
200                 stream = AccessController.doPrivileged(new PrivilegedAction  <InputStream  >() {
201                     public InputStream   run() {
202                         return StringPrep.class.getResourceAsStream(IDN_PROFILE);
203                     }
204                 });
205             } else {
206                 stream = StringPrep.class.getResourceAsStream(IDN_PROFILE);
207             }
208 
209             namePrep = new StringPrep(stream);
210             stream.close();
211         } catch (IOException   e) {
212             // should never reach here
213             assert false;
214         }
215     }
216 
217 
218     /* ---------------- Private operations -------------- */
219     
220     
221     //
222     // to suppress the default zero-argument constructor
223     //
224     private IDN() {}
225     
226     //
227     // toASCII operation; should only apply to a single label
228     //
229     private static String   toASCIIInternal(String   label, int flag)
230     {
231         // step 1
232         // Check if the string contains code points outside the ASCII range 0..0x7c.
233         boolean isASCII  = isAllASCII(label);
234         StringBuffer   dest;
235         
236         // step 2
237         // perform the nameprep operation; flag ALLOW_UNASSIGNED is used here
238         if (!isASCII) {
239             UCharacterIterator iter = UCharacterIterator.getInstance(label);
240             try {
241                 dest = namePrep.prepare(iter, flag);
242             } catch (java.text.ParseException   e) {
243                 throw new IllegalArgumentException  (e);
244             }
245         } else {
246             dest = new StringBuffer  (label);
247         }
248         
249         // step 3
250         // Verify the absence of non-LDH ASCII code points
251         //   0..0x2c, 0x2e..0x2f, 0x3a..0x40, 0x5b..0x60, 0x7b..0x7f
252         // Verify the absence of leading and trailing hyphen
253         boolean useSTD3ASCIIRules = ((flag & USE_STD3_ASCII_RULES) != 0);
254         if (useSTD3ASCIIRules) {
255             for (int i = 0; i < dest.length(); i++) {
256                 int c = dest.charAt(i);
257                 if (!isLDHChar(c)) {
258                     throw new IllegalArgumentException  ("Contains non-LDH characters");
259                 }
260             }
261             
262             if (dest.charAt(0) == '-' || dest.charAt(dest.length() - 1) == '-') {
263                 throw new IllegalArgumentException  ("Has leading or trailing hyphen");
264             }
265         }
266 
267         if (!isASCII) {
268             // step 4
269             // If all code points are inside 0..0x7f, skip to step 8
270             if (!isAllASCII(dest.toString())) {
271                 // step 5
272                 // verify the sequence does not begin with ACE prefix
273                 if(!startsWithACEPrefix(dest)){
274                     
275                     // step 6
276                     // encode the sequence with punycode
277                     try {
278                         dest = Punycode.encode(dest, null);
279                     } catch (java.text.ParseException   e) {
280                         throw new IllegalArgumentException  (e);
281                     }
282                     
283                     dest = toASCIILower(dest);
284                     
285                     // step 7
286                     // prepend the ACE prefix
287                     dest.insert(0, ACE_PREFIX);
288                 } else {
289                     throw new IllegalArgumentException  ("The input starts with the ACE Prefix");
290                 }
291                 
292             }
293         }
294 
295         // step 8
296         // the length must be inside 1..63
297         if(dest.length() > MAX_LABEL_LENGTH){
298             throw new IllegalArgumentException  ("The label in the input is too long");
299         }
300         
301         return dest.toString();
302     }
303     
304     //
305     // toUnicode operation; should only apply to a single label
306     //
307     private static String   toUnicodeInternal(String   label, int flag) {
308         boolean[] caseFlags = null;
309         StringBuffer   dest;
310         
311         // step 1
312         // find out if all the codepoints in input are ASCII
313         boolean isASCII = isAllASCII(label);
314         
315         if(!isASCII){
316             // step 2
317             // perform the nameprep operation; flag ALLOW_UNASSIGNED is used here
318             try {
319                 UCharacterIterator iter = UCharacterIterator.getInstance(label);
320                 dest = namePrep.prepare(iter, flag);
321             } catch (Exception   e) {
322                 // toUnicode never fails; if any step fails, return the input string
323                 return label;
324             }
325         } else {
326             dest = new StringBuffer  (label);
327         }
328         
329         // step 3
330         // verify ACE Prefix
331         if(startsWithACEPrefix(dest)) {
332 
333             // step 4
334             // Remove the ACE Prefix
335             String   temp = dest.substring(ACE_PREFIX_LENGTH, dest.length());
336 
337             try {
338                 // step 5
339                 // Decode using punycode
340                 StringBuffer   decodeOut = Punycode.decode(new StringBuffer  (temp), null);
341                 
342                 // step 6
343                 // Apply toASCII
344                 String   toASCIIOut = toASCII(decodeOut.toString(), flag);
345                 
346                 // step 7
347                 // verify
348                 if (toASCIIOut.equalsIgnoreCase(dest.toString())) {
349                     // step 8
350                     // return output of step 5
351                     return decodeOut.toString();
352                 }
353             } catch (Exception   ignored) {
354                 // no-op
355             }
356         }
357         
358         // just return the input
359         return label;
360     }
361 
362     
363     //
364     // LDH stands for "letter/digit/hyphen", with characters restricted to the
365     // 26-letter Latin alphabet <A-Z a-z>, the digits <0-9>, and the hyphen
366     // <->
367     // non-LDH = 0..0x2C, 0x2E..0x2F, 0x3A..0x40, 0x56..0x60, 0x7B..0x7F
368     //
369     private static boolean isLDHChar(int ch){
370         // high runner case
371         if(ch > 0x007A){
372             return false;
373         }
374         //['-' '0'..'9' 'A'..'Z' 'a'..'z']
375         if((ch == 0x002D) || 
376            (0x0030 <= ch && ch <= 0x0039) ||
377            (0x0041 <= ch && ch <= 0x005A) ||
378            (0x0061 <= ch && ch <= 0x007A)
379           ){
380             return true;
381         }
382         return false;
383     }
384     
385     
386     //
387     // search dots in a string and return the index of that character;
388     // or if there is no dots, return the length of input string
389     // dots might be: \u002E (full stop), \u3002 (ideographic full stop), \uFF0E (fullwidth full stop),
390     // and \uFF61 (halfwidth ideographic full stop).
391     //
392     private static int searchDots(String   s, int start) {
393         int i;
394         for (i = start; i < s.length(); i++) {
395             char c = s.charAt(i);
396             if (c == '.' || c == '\u3002' || c == '\uFF0E' || c == '\uFF61') {
397                 break;
398             }
399         }
400         
401         return i;
402     }
403     
404     
405     //
406     // to check if a string only contains US-ASCII code point
407     //
408     private static boolean isAllASCII(String   input) {
409         boolean isASCII = true;
410         for (int i = 0; i < input.length(); i++) {
411             int c = input.charAt(i);
412             if (c > 0x7F) {
413                 isASCII = false;
414                 break;
415             }
416         }
417         return isASCII;
418     }
419 
420     //
421     // to check if a string starts with ACE-prefix
422     //
423     private static boolean startsWithACEPrefix(StringBuffer   input){
424         boolean startsWithPrefix = true;
425 
426         if(input.length() < ACE_PREFIX_LENGTH){
427             return false;
428         }
429         for(int i = 0; i < ACE_PREFIX_LENGTH; i++){
430             if(toASCIILower(input.charAt(i)) != ACE_PREFIX.charAt(i)){
431                 startsWithPrefix = false;
432             }
433         }
434         return startsWithPrefix;
435     }
436 
437     private static char toASCIILower(char ch){
438         if('A' <= ch && ch <= 'Z'){
439             return (char)(ch + 'a' - 'A');
440         }
441         return ch;
442     }
443 
444     private static StringBuffer   toASCIILower(StringBuffer   input){
445         StringBuffer   dest = new StringBuffer  ();
446         for(int i = 0; i < input.length();i++){
447             dest.append(toASCIILower(input.charAt(i)));
448         }
449         return dest;
450     }
451 }
452
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags