KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > com > ibm > icu > text > IDNA


1 /*
2  *******************************************************************************
3  * Copyright (C) 2003-2006, International Business Machines Corporation and *
4  * others. All Rights Reserved. *
5  *******************************************************************************
6  */

7 package com.ibm.icu.text;
8
9 import java.io.IOException JavaDoc;
10 import java.io.InputStream JavaDoc;
11 import java.util.MissingResourceException JavaDoc;
12
13 import com.ibm.icu.impl.ICUData;
14 import com.ibm.icu.impl.ICUResourceBundle;
15
16 /**
17  *
18  * IDNA API implements the IDNA protocol as defined in the <a HREF="http://www.ietf.org/rfc/rfc3490.txt">IDNA RFC</a>.
19  * The draft defines 2 operations: ToASCII and ToUnicode. Domain labels
20  * containing non-ASCII code points are required to be processed by
21  * ToASCII operation before passing it to resolver libraries. Domain names
22  * that are obtained from resolver libraries are required to be processed by
23  * ToUnicode operation before displaying the domain name to the user.
24  * IDNA requires that implementations process input strings with
25  * <a HREF="http://www.ietf.org/rfc/rfc3491.txt">Nameprep</a>,
26  * which is a profile of <a HREF="http://www.ietf.org/rfc/rfc3454.txt">Stringprep</a> ,
27  * and then with <a HREF="http://www.ietf.org/rfc/rfc3492.txt">Punycode</a>.
28  * Implementations of IDNA MUST fully implement Nameprep and Punycode;
29  * neither Nameprep nor Punycode are optional.
30  * The input and output of ToASCII and ToUnicode operations are Unicode
31  * and are designed to be chainable, i.e., applying ToASCII or ToUnicode operations
32  * multiple times to an input string will yield the same result as applying the operation
33  * once.
34  * ToUnicode(ToUnicode(ToUnicode...(ToUnicode(string)))) == ToUnicode(string)
35  * ToASCII(ToASCII(ToASCII...(ToASCII(string)))) == ToASCII(string).
36  *
37  * @author Ram Viswanadha
38  * @stable ICU 2.8
39  */

40 public final class IDNA {
41
42     /* IDNA ACE Prefix is "xn--" */
43     private static char[] ACE_PREFIX = new char[]{ 0x0078,0x006E,0x002d,0x002d } ;
44     private static final int ACE_PREFIX_LENGTH = 4;
45
46     private static final int MAX_LABEL_LENGTH = 63;
47     private static final int HYPHEN = 0x002D;
48     private static final int CAPITAL_A = 0x0041;
49     private static final int CAPITAL_Z = 0x005A;
50     private static final int LOWER_CASE_DELTA = 0x0020;
51     private static final int FULL_STOP = 0x002E;
52
53     /**
54      * Option to prohibit processing of unassigned codepoints in the input and
55      * do not check if the input conforms to STD-3 ASCII rules.
56      *
57      * @see #convertToASCII #convertToUnicode
58      * @stable ICU 2.8
59      */

60     public static final int DEFAULT = 0x0000;
61     /**
62      * Option to allow processing of unassigned codepoints in the input
63      *
64      * @see #convertToASCII #convertToUnicode
65      * @stable ICU 2.8
66      */

67     public static final int ALLOW_UNASSIGNED = 0x0001;
68     /**
69      * Option to check if input conforms to STD-3 ASCII rules
70      *
71      * @see #convertToASCII #convertToUnicode
72      * @stable ICU 2.8
73      */

74     public static final int USE_STD3_RULES = 0x0002;
75     
// Singleton instance. It is created during class initialization, so it is
// guaranteed to be fully initialized before any static method can use it.
private static final IDNA singleton = new IDNA();

// The NamePrep profile object; assigned exactly once, by the constructor.
private final StringPrep namePrep;

/**
 * Private constructor (prevents outside construction). Loads the NamePrep
 * profile from the "uidna.spp" resource of the ICU data bundle.
 *
 * @throws MissingResourceException if reading the profile fails with an
 *         IOException; the IOException is attached as the cause.
 */
private IDNA(){
    try{
        InputStream stream = ICUData.getRequiredStream(ICUResourceBundle.ICU_BUNDLE+"/uidna.spp");
        try{
            namePrep = new StringPrep(stream);
        }finally{
            // close the stream even when StringPrep construction fails
            stream.close();
        }
    }catch (IOException e){
        MissingResourceException mre = new MissingResourceException(e.toString(),"","");
        mre.initCause(e);
        throw mre;
    }
}
94     
95     private static boolean startsWithPrefix(StringBuffer JavaDoc src){
96         boolean startsWithPrefix = true;
97
98         if(src.length() < ACE_PREFIX_LENGTH){
99             return false;
100         }
101         for(int i=0; i<ACE_PREFIX_LENGTH;i++){
102             if(toASCIILower(src.charAt(i)) != ACE_PREFIX[i]){
103                 startsWithPrefix = false;
104             }
105         }
106         return startsWithPrefix;
107     }
108
109     private static char toASCIILower(char ch){
110         if(CAPITAL_A <= ch && ch <= CAPITAL_Z){
111             return (char)(ch + LOWER_CASE_DELTA);
112         }
113         return ch;
114     }
115
116     private static StringBuffer JavaDoc toASCIILower(StringBuffer JavaDoc src){
117         StringBuffer JavaDoc dest = new StringBuffer JavaDoc();
118         for(int i=0; i<src.length();i++){
119             dest.append(toASCIILower(src.charAt(i)));
120         }
121         return dest;
122     }
123
124     private static int compareCaseInsensitiveASCII(StringBuffer JavaDoc s1, StringBuffer JavaDoc s2){
125         char c1,c2;
126         int rc;
127         for(int i =0;/* no condition */;i++) {
128             /* If we reach the ends of both strings then they match */
129             if(i == s1.length()) {
130                 return 0;
131             }
132
133             c1 = s1.charAt(i);
134             c2 = s2.charAt(i);
135         
136             /* Case-insensitive comparison */
137             if(c1!=c2) {
138                 rc=toASCIILower(c1)-toASCIILower(c2);
139                 if(rc!=0) {
140                     return rc;
141                 }
142             }
143         }
144     }
145    
146     private static int getSeparatorIndex(char[] src,int start, int limit){
147         for(; start<limit;start++){
148             if(isLabelSeparator(src[start])){
149                 return start;
150             }
151         }
152         // we have not found the separator just return length
153
return start;
154     }
155     
156     /*
157     private static int getSeparatorIndex(UCharacterIterator iter){
158         int currentIndex = iter.getIndex();
159         int separatorIndex = 0;
160         int ch;
161         while((ch=iter.next())!= UCharacterIterator.DONE){
162             if(isLabelSeparator(ch)){
163                 separatorIndex = iter.getIndex();
164                 iter.setIndex(currentIndex);
165                 return separatorIndex;
166             }
167         }
168         // reset index
169         iter.setIndex(currentIndex);
170         // we have not found the separator just return the length
171        
172     }
173     */

174     
175
176     private static boolean isLDHChar(int ch){
177         // high runner case
178
if(ch>0x007A){
179             return false;
180         }
181         //[\\u002D \\u0030-\\u0039 \\u0041-\\u005A \\u0061-\\u007A]
182
if( (ch==0x002D) ||
183             (0x0030 <= ch && ch <= 0x0039) ||
184             (0x0041 <= ch && ch <= 0x005A) ||
185             (0x0061 <= ch && ch <= 0x007A)
186           ){
187             return true;
188         }
189         return false;
190     }
191     
192     /**
193      * Ascertain if the given code point is a label separator as
194      * defined by the IDNA RFC
195      *
196      * @param ch The code point to be ascertained
197      * @return true if the char is a label separator
198      * @stable ICU 2.8
199      */

200     private static boolean isLabelSeparator(int ch){
201         switch(ch){
202             case 0x002e:
203             case 0x3002:
204             case 0xFF0E:
205             case 0xFF61:
206                 return true;
207             default:
208                 return false;
209         }
210     }
211        
212     /**
213      * This function implements the ToASCII operation as defined in the IDNA RFC.
214      * This operation is done on <b>single labels</b> before sending it to something that expects
215      * ASCII names. A label is an individual part of a domain name. Labels are usually
216      * separated by dots; e.g." "www.example.com" is composed of 3 labels
217      * "www","example", and "com".
218      *
219      * @param src The input string to be processed
220      * @param options A bit set of options:
221      * - IDNA.DEFAULT Use default options, i.e., do not process unassigned code points
222      * and do not use STD3 ASCII rules
223      * If unassigned code points are found the operation fails with
224      * ParseException.
225      *
226      * - IDNA.ALLOW_UNASSIGNED Unassigned values can be converted to ASCII for query operations
227      * If this option is set, the unassigned code points are in the input
228      * are treated as normal Unicode code points.
229      *
230      * - IDNA.USE_STD3_RULES Use STD3 ASCII rules for host name syntax restrictions
231      * If this option is set and the input does not satisfy STD3 rules,
232      * the operation will fail with ParseException
233      * @return StringBuffer the converted String
234      * @throws ParseException
235      * @stable ICU 2.8
236      */

237     public static StringBuffer JavaDoc convertToASCII(String JavaDoc src, int options)
238         throws StringPrepParseException{
239         UCharacterIterator iter = UCharacterIterator.getInstance(src);
240         return convertToASCII(iter,options);
241     }
242     
243     /**
244      * This function implements the ToASCII operation as defined in the IDNA RFC.
245      * This operation is done on <b>single labels</b> before sending it to something that expects
246      * ASCII names. A label is an individual part of a domain name. Labels are usually
247      * separated by dots; e.g." "www.example.com" is composed of 3 labels
248      * "www","example", and "com".
249      *
250      * @param src The input string as StringBuffer to be processed
251      * @param options A bit set of options:
252      * - IDNA.DEFAULT Use default options, i.e., do not process unassigned code points
253      * and do not use STD3 ASCII rules
254      * If unassigned code points are found the operation fails with
255      * ParseException.
256      *
257      * - IDNA.ALLOW_UNASSIGNED Unassigned values can be converted to ASCII for query operations
258      * If this option is set, the unassigned code points are in the input
259      * are treated as normal Unicode code points.
260      *
261      * - IDNA.USE_STD3_RULES Use STD3 ASCII rules for host name syntax restrictions
262      * If this option is set and the input does not satisfy STD3 rules,
263      * the operation will fail with ParseException
264      * @return StringBuffer the converted String
265      * @throws ParseException
266      * @stable ICU 2.8
267      */

268     public static StringBuffer JavaDoc convertToASCII(StringBuffer JavaDoc src, int options)
269         throws StringPrepParseException{
270         UCharacterIterator iter = UCharacterIterator.getInstance(src);
271         return convertToASCII(iter,options);
272     }
273     
274     /**
275      * This function implements the ToASCII operation as defined in the IDNA RFC.
276      * This operation is done on <b>single labels</b> before sending it to something that expects
277      * ASCII names. A label is an individual part of a domain name. Labels are usually
278      * separated by dots; e.g." "www.example.com" is composed of 3 labels
279      * "www","example", and "com".
280      *
281      * @param src The input string as UCharacterIterator to be processed
282      * @param options A bit set of options:
283      * - IDNA.DEFAULT Use default options, i.e., do not process unassigned code points
284      * and do not use STD3 ASCII rules
285      * If unassigned code points are found the operation fails with
286      * ParseException.
287      *
288      * - IDNA.ALLOW_UNASSIGNED Unassigned values can be converted to ASCII for query operations
289      * If this option is set, the unassigned code points are in the input
290      * are treated as normal Unicode code points.
291      *
292      * - IDNA.USE_STD3_RULES Use STD3 ASCII rules for host name syntax restrictions
293      * If this option is set and the input does not satisfy STD3 rules,
294      * the operation will fail with ParseException
295      * @return StringBuffer the converted String
296      * @throws ParseException
297      * @stable ICU 2.8
298      */

299     public static StringBuffer JavaDoc convertToASCII(UCharacterIterator src, int options)
300                 throws StringPrepParseException{
301         
302         boolean[] caseFlags = null;
303     
304         // the source contains all ascii codepoints
305
boolean srcIsASCII = true;
306         // assume the source contains all LDH codepoints
307
boolean srcIsLDH = true;
308
309         //get the options
310
boolean useSTD3ASCIIRules = ((options & USE_STD3_RULES) != 0);
311         int ch;
312         // step 1
313
while((ch = src.next())!= UCharacterIterator.DONE){
314             if(ch> 0x7f){
315                 srcIsASCII = false;
316             }
317         }
318         int failPos = -1;
319         src.setToStart();
320         StringBuffer JavaDoc processOut = null;
321         // step 2 is performed only if the source contains non ASCII
322
if(!srcIsASCII){
323             // step 2
324
processOut = singleton.namePrep.prepare(src, options);
325         }else{
326             processOut = new StringBuffer JavaDoc(src.getText());
327         }
328         int poLen = processOut.length();
329         
330         if(poLen==0){
331             throw new StringPrepParseException("Found zero length lable after NamePrep.",StringPrepParseException.ZERO_LENGTH_LABEL);
332         }
333         StringBuffer JavaDoc dest = new StringBuffer JavaDoc();
334         
335         // reset the variable to verify if output of prepare is ASCII or not
336
srcIsASCII = true;
337         
338         // step 3 & 4
339
for(int j=0;j<poLen;j++ ){
340             ch=processOut.charAt(j);
341             if(ch > 0x7F){
342                 srcIsASCII = false;
343             }else if(isLDHChar(ch)==false){
344                 // here we do not assemble surrogates
345
// since we know that LDH code points
346
// are in the ASCII range only
347
srcIsLDH = false;
348                 failPos = j;
349             }
350         }
351     
352         if(useSTD3ASCIIRules == true){
353             // verify 3a and 3b
354
if( srcIsLDH == false /* source contains some non-LDH characters */
355                 || processOut.charAt(0) == HYPHEN
356                 || processOut.charAt(processOut.length()-1) == HYPHEN){
357
358                 /* populate the parseError struct */
359                 if(srcIsLDH==false){
360                      throw new StringPrepParseException( "The input does not conform to the STD 3 ASCII rules",
361                                               StringPrepParseException.STD3_ASCII_RULES_ERROR,
362                                               processOut.toString(),
363                                              (failPos>0) ? (failPos-1) : failPos);
364                 }else if(processOut.charAt(0) == HYPHEN){
365                     throw new StringPrepParseException("The input does not conform to the STD 3 ASCII rules",
366                                               StringPrepParseException.STD3_ASCII_RULES_ERROR,processOut.toString(),0);
367      
368                 }else{
369                      throw new StringPrepParseException("The input does not conform to the STD 3 ASCII rules",
370                                               StringPrepParseException.STD3_ASCII_RULES_ERROR,
371                                               processOut.toString(),
372                                               (poLen>0) ? poLen-1 : poLen);
373
374                 }
375             }
376         }
377         if(srcIsASCII){
378             dest = processOut;
379         }else{
380             // step 5 : verify the sequence does not begin with ACE prefix
381
if(!startsWithPrefix(processOut)){
382
383                 //step 6: encode the sequence with punycode
384
caseFlags = new boolean[poLen];
385
386                 StringBuffer JavaDoc punyout = Punycode.encode(processOut,caseFlags);
387
388                 // convert all codepoints to lower case ASCII
389
StringBuffer JavaDoc lowerOut = toASCIILower(punyout);
390
391                 //Step 7: prepend the ACE prefix
392
dest.append(ACE_PREFIX,0,ACE_PREFIX_LENGTH);
393                 //Step 6: copy the contents in b2 into dest
394
dest.append(lowerOut);
395             }else{
396
397                 throw new StringPrepParseException("The input does not start with the ACE Prefix.",
398                                          StringPrepParseException.ACE_PREFIX_ERROR,processOut.toString(),0);
399             }
400         }
401         if(dest.length() > MAX_LABEL_LENGTH){
402             throw new StringPrepParseException("The labels in the input are too long. Length > 64.",
403                                      StringPrepParseException.LABEL_TOO_LONG_ERROR,dest.toString(),0);
404         }
405         return dest;
406     }
407         
408     /**
409      * Convenience function that implements the IDNToASCII operation as defined in the IDNA RFC.
410      * This operation is done on complete domain names, e.g: "www.example.com".
411      * It is important to note that this operation can fail. If it fails, then the input
412      * domain name cannot be used as an Internationalized Domain Name and the application
413      * should have methods defined to deal with the failure.
414      *
415      * <b>Note:</b> IDNA RFC specifies that a conformant application should divide a domain name
416      * into separate labels, decide whether to apply allowUnassigned and useSTD3ASCIIRules on each,
417      * and then convert. This function does not offer that level of granularity. The options once
418      * set will apply to all labels in the domain name
419      *
420      * @param src The input string as UCharacterIterator to be processed
421      * @param options A bit set of options:
422      * - IDNA.DEFAULT Use default options, i.e., do not process unassigned code points
423      * and do not use STD3 ASCII rules
424      * If unassigned code points are found the operation fails with
425      * ParseException.
426      *
427      * - IDNA.ALLOW_UNASSIGNED Unassigned values can be converted to ASCII for query operations
428      * If this option is set, the unassigned code points are in the input
429      * are treated as normal Unicode code points.
430      *
431      * - IDNA.USE_STD3_RULES Use STD3 ASCII rules for host name syntax restrictions
432      * If this option is set and the input does not satisfy STD3 rules,
433      * the operation will fail with ParseException
434      * @return StringBuffer the converted String
435      * @throws ParseException
436      * @stable ICU 2.8
437      */

438     public static StringBuffer JavaDoc convertIDNToASCII(UCharacterIterator src, int options)
439             throws StringPrepParseException{
440         return convertIDNToASCII(src.getText(), options);
441     }
442     
443     /**
444      * Convenience function that implements the IDNToASCII operation as defined in the IDNA RFC.
445      * This operation is done on complete domain names, e.g: "www.example.com".
446      * It is important to note that this operation can fail. If it fails, then the input
447      * domain name cannot be used as an Internationalized Domain Name and the application
448      * should have methods defined to deal with the failure.
449      *
450      * <b>Note:</b> IDNA RFC specifies that a conformant application should divide a domain name
451      * into separate labels, decide whether to apply allowUnassigned and useSTD3ASCIIRules on each,
452      * and then convert. This function does not offer that level of granularity. The options once
453      * set will apply to all labels in the domain name
454      *
455      * @param src The input string as a StringBuffer to be processed
456      * @param options A bit set of options:
457      * - IDNA.DEFAULT Use default options, i.e., do not process unassigned code points
458      * and do not use STD3 ASCII rules
459      * If unassigned code points are found the operation fails with
460      * ParseException.
461      *
462      * - IDNA.ALLOW_UNASSIGNED Unassigned values can be converted to ASCII for query operations
463      * If this option is set, the unassigned code points are in the input
464      * are treated as normal Unicode code points.
465      *
466      * - IDNA.USE_STD3_RULES Use STD3 ASCII rules for host name syntax restrictions
467      * If this option is set and the input does not satisfy STD3 rules,
468      * the operation will fail with ParseException
469      * @return StringBuffer the converted String
470      * @throws ParseException
471      * @stable ICU 2.8
472      */

473     public static StringBuffer JavaDoc convertIDNToASCII(StringBuffer JavaDoc src, int options)
474             throws StringPrepParseException{
475             return convertIDNToASCII(src.toString(), options);
476     }
477     
478     /**
479      * Convenience function that implements the IDNToASCII operation as defined in the IDNA RFC.
480      * This operation is done on complete domain names, e.g: "www.example.com".
481      * It is important to note that this operation can fail. If it fails, then the input
482      * domain name cannot be used as an Internationalized Domain Name and the application
483      * should have methods defined to deal with the failure.
484      *
485      * <b>Note:</b> IDNA RFC specifies that a conformant application should divide a domain name
486      * into separate labels, decide whether to apply allowUnassigned and useSTD3ASCIIRules on each,
487      * and then convert. This function does not offer that level of granularity. The options once
488      * set will apply to all labels in the domain name
489      *
490      * @param src The input string to be processed
491      * @param options A bit set of options:
492      * - IDNA.DEFAULT Use default options, i.e., do not process unassigned code points
493      * and do not use STD3 ASCII rules
494      * If unassigned code points are found the operation fails with
495      * ParseException.
496      *
497      * - IDNA.ALLOW_UNASSIGNED Unassigned values can be converted to ASCII for query operations
498      * If this option is set, the unassigned code points are in the input
499      * are treated as normal Unicode code points.
500      *
501      * - IDNA.USE_STD3_RULES Use STD3 ASCII rules for host name syntax restrictions
502      * If this option is set and the input does not satisfy STD3 rules,
503      * the operation will fail with ParseException
504      * @return StringBuffer the converted String
505      * @throws ParseException
506      * @stable ICU 2.8
507      */

508     public static StringBuffer JavaDoc convertIDNToASCII(String JavaDoc src,int options)
509             throws StringPrepParseException{
510
511         char[] srcArr = src.toCharArray();
512         StringBuffer JavaDoc result = new StringBuffer JavaDoc();
513         int sepIndex=0;
514         int oldSepIndex=0;
515         for(;;){
516             sepIndex = getSeparatorIndex(srcArr,sepIndex,srcArr.length);
517             String JavaDoc label = new String JavaDoc(srcArr,oldSepIndex,sepIndex-oldSepIndex);
518             //make sure this is not a root label separator.
519
if(!(label.length()==0 && sepIndex==srcArr.length)){
520                 UCharacterIterator iter = UCharacterIterator.getInstance(label);
521                 result.append(convertToASCII(iter,options));
522             }
523             if(sepIndex==srcArr.length){
524                 break;
525             }
526             
527             // increment the sepIndex to skip past the separator
528
sepIndex++;
529             oldSepIndex = sepIndex;
530             result.append((char)FULL_STOP);
531         }
532         return result;
533     }
534
535     
536     /**
537      * This function implements the ToUnicode operation as defined in the IDNA RFC.
538      * This operation is done on <b>single labels</b> before sending it to something that expects
539      * Unicode names. A label is an individual part of a domain name. Labels are usually
540      * separated by dots; for e.g." "www.example.com" is composed of 3 labels
541      * "www","example", and "com".
542      *
543      * @param src The input string to be processed
544      * @param options A bit set of options:
545      * - IDNA.DEFAULT Use default options, i.e., do not process unassigned code points
546      * and do not use STD3 ASCII rules
547      * If unassigned code points are found the operation fails with
548      * ParseException.
549      *
550      * - IDNA.ALLOW_UNASSIGNED Unassigned values can be converted to ASCII for query operations
551      * If this option is set, the unassigned code points are in the input
552      * are treated as normal Unicode code points.
553      *
554      * - IDNA.USE_STD3_RULES Use STD3 ASCII rules for host name syntax restrictions
555      * If this option is set and the input does not satisfy STD3 rules,
556      * the operation will fail with ParseException
557      * @return StringBuffer the converted String
558      * @throws ParseException
559      * @stable ICU 2.8
560      */

561     public static StringBuffer JavaDoc convertToUnicode(String JavaDoc src, int options)
562            throws StringPrepParseException{
563         UCharacterIterator iter = UCharacterIterator.getInstance(src);
564         return convertToUnicode(iter,options);
565     }
566     
567     /**
568      * This function implements the ToUnicode operation as defined in the IDNA RFC.
569      * This operation is done on <b>single labels</b> before sending it to something that expects
570      * Unicode names. A label is an individual part of a domain name. Labels are usually
571      * separated by dots; for e.g." "www.example.com" is composed of 3 labels
572      * "www","example", and "com".
573      *
574      * @param src The input string as StringBuffer to be processed
575      * @param options A bit set of options:
576      * - IDNA.DEFAULT Use default options, i.e., do not process unassigned code points
577      * and do not use STD3 ASCII rules
578      * If unassigned code points are found the operation fails with
579      * ParseException.
580      *
581      * - IDNA.ALLOW_UNASSIGNED Unassigned values can be converted to ASCII for query operations
582      * If this option is set, the unassigned code points are in the input
583      * are treated as normal Unicode code points.
584      *
585      * - IDNA.USE_STD3_RULES Use STD3 ASCII rules for host name syntax restrictions
586      * If this option is set and the input does not satisfy STD3 rules,
587      * the operation will fail with ParseException
588      * @return StringBuffer the converted String
589      * @throws ParseException
590      * @stable ICU 2.8
591      */

592     public static StringBuffer JavaDoc convertToUnicode(StringBuffer JavaDoc src, int options)
593            throws StringPrepParseException{
594         UCharacterIterator iter = UCharacterIterator.getInstance(src);
595         return convertToUnicode(iter,options);
596     }
597        
598     /**
599      * This function implements the ToUnicode operation as defined in the IDNA RFC.
600      * This operation is done on <b>single labels</b> before sending it to something that expects
601      * Unicode names. A label is an individual part of a domain name. Labels are usually
602      * separated by dots; for e.g." "www.example.com" is composed of 3 labels
603      * "www","example", and "com".
604      *
605      * @param src The input string as UCharacterIterator to be processed
606      * @param options A bit set of options:
607      * - IDNA.DEFAULT Use default options, i.e., do not process unassigned code points
608      * and do not use STD3 ASCII rules
609      * If unassigned code points are found the operation fails with
610      * ParseException.
611      *
612      * - IDNA.ALLOW_UNASSIGNED Unassigned values can be converted to ASCII for query operations
613      * If this option is set, the unassigned code points are in the input
614      * are treated as normal Unicode code points.
615      *
616      * - IDNA.USE_STD3_RULES Use STD3 ASCII rules for host name syntax restrictions
617      * If this option is set and the input does not satisfy STD3 rules,
618      * the operation will fail with ParseException
619      * @return StringBuffer the converted String
620      * @throws ParseException
621      * @stable ICU 2.8
622      */

623     public static StringBuffer JavaDoc convertToUnicode(UCharacterIterator src, int options)
624            throws StringPrepParseException{
625         
626         boolean[] caseFlags = null;
627         
628         // the source contains all ascii codepoints
629
boolean srcIsASCII = true;
630         // assume the source contains all LDH codepoints
631
boolean srcIsLDH = true;
632         
633         //get the options
634
boolean useSTD3ASCIIRules = ((options & USE_STD3_RULES) != 0);
635         
636         int failPos = -1;
637         int ch;
638         int saveIndex = src.getIndex();
639         // step 1: find out if all the codepoints in src are ASCII
640
while((ch=src.next())!= UCharacterIterator.DONE){
641             if(ch>0x7F){
642                 srcIsASCII = false;
643             }else if((srcIsLDH = isLDHChar(ch))==false){
644                 failPos = src.getIndex();
645             }
646         }
647         StringBuffer JavaDoc processOut;
648         
649         if(srcIsASCII == false){
650             try {
651                 // step 2: process the string
652
src.setIndex(saveIndex);
653                 processOut = singleton.namePrep.prepare(src,options);
654             } catch (StringPrepParseException ex) {
655                 return new StringBuffer JavaDoc(src.getText());
656             }
657
658         }else{
659             //just point to source
660
processOut = new StringBuffer JavaDoc(src.getText());
661         }
662         // TODO:
663
// The RFC states that
664
// <quote>
665
// ToUnicode never fails. If any step fails, then the original input
666
// is returned immediately in that step.
667
// </quote>
668

669         //step 3: verify ACE Prefix
670
if(startsWithPrefix(processOut)){
671             StringBuffer JavaDoc decodeOut = null;
672
673             //step 4: Remove the ACE Prefix
674
String JavaDoc temp = processOut.substring(ACE_PREFIX_LENGTH,processOut.length());
675
676             //step 5: Decode using punycode
677
try {
678                 decodeOut = Punycode.decode(new StringBuffer JavaDoc(temp),caseFlags);
679             } catch (StringPrepParseException e) {
680                 decodeOut = null;
681             }
682         
683             //step 6:Apply toASCII
684
if (decodeOut != null) {
685                 StringBuffer JavaDoc toASCIIOut = convertToASCII(decodeOut, options);
686     
687                 //step 7: verify
688
if(compareCaseInsensitiveASCII(processOut, toASCIIOut) !=0){
689 // throw new StringPrepParseException("The verification step prescribed by the RFC 3491 failed",
690
// StringPrepParseException.VERIFICATION_ERROR);
691
decodeOut = null;
692                 }
693             }
694
695             //step 8: return output of step 5
696
if (decodeOut != null) {
697                  return decodeOut;
698              }
699         }
700             
701 // }else{
702
// // verify that STD3 ASCII rules are satisfied
703
// if(useSTD3ASCIIRules == true){
704
// if( srcIsLDH == false /* source contains some non-LDH characters */
705
// || processOut.charAt(0) == HYPHEN
706
// || processOut.charAt(processOut.length()-1) == HYPHEN){
707
//
708
// if(srcIsLDH==false){
709
// throw new StringPrepParseException("The input does not conform to the STD 3 ASCII rules",
710
// StringPrepParseException.STD3_ASCII_RULES_ERROR,processOut.toString(),
711
// (failPos>0) ? (failPos-1) : failPos);
712
// }else if(processOut.charAt(0) == HYPHEN){
713
// throw new StringPrepParseException("The input does not conform to the STD 3 ASCII rules",
714
// StringPrepParseException.STD3_ASCII_RULES_ERROR,
715
// processOut.toString(),0);
716
//
717
// }else{
718
// throw new StringPrepParseException("The input does not conform to the STD 3 ASCII rules",
719
// StringPrepParseException.STD3_ASCII_RULES_ERROR,
720
// processOut.toString(),
721
// processOut.length());
722
//
723
// }
724
// }
725
// }
726
// // just return the source
727
// return new StringBuffer(src.getText());
728
// }
729

730         return new StringBuffer JavaDoc(src.getText());
731     }
732     
733     /**
734      * Convenience function that implements the IDNToUnicode operation as defined in the IDNA RFC.
735      * This operation is done on complete domain names, e.g: "www.example.com".
736      *
737      * <b>Note:</b> IDNA RFC specifies that a conformant application should divide a domain name
738      * into separate labels, decide whether to apply allowUnassigned and useSTD3ASCIIRules on each,
739      * and then convert. This function does not offer that level of granularity. The options once
740      * set will apply to all labels in the domain name
741      *
742      * @param src The input string as UCharacterIterator to be processed
743      * @param options A bit set of options:
744      * - IDNA.DEFAULT Use default options, i.e., do not process unassigned code points
745      * and do not use STD3 ASCII rules
746      * If unassigned code points are found the operation fails with
747      * ParseException.
748      *
749      * - IDNA.ALLOW_UNASSIGNED Unassigned values can be converted to ASCII for query operations
750      * If this option is set, the unassigned code points are in the input
751      * are treated as normal Unicode code points.
752      *
753      * - IDNA.USE_STD3_RULES Use STD3 ASCII rules for host name syntax restrictions
754      * If this option is set and the input does not satisfy STD3 rules,
755      * the operation will fail with ParseException
756      * @return StringBuffer the converted String
757      * @throws ParseException
758      * @stable ICU 2.8
759      */

760     public static StringBuffer JavaDoc convertIDNToUnicode(UCharacterIterator src, int options)
761         throws StringPrepParseException{
762         return convertIDNToUnicode(src.getText(), options);
763     }
764     
765     /**
766      * Convenience function that implements the IDNToUnicode operation as defined in the IDNA RFC.
767      * This operation is done on complete domain names, e.g: "www.example.com".
768      *
769      * <b>Note:</b> IDNA RFC specifies that a conformant application should divide a domain name
770      * into separate labels, decide whether to apply allowUnassigned and useSTD3ASCIIRules on each,
771      * and then convert. This function does not offer that level of granularity. The options once
772      * set will apply to all labels in the domain name
773      *
774      * @param src The input string as StringBuffer to be processed
775      * @param options A bit set of options:
776      * - IDNA.DEFAULT Use default options, i.e., do not process unassigned code points
777      * and do not use STD3 ASCII rules
778      * If unassigned code points are found the operation fails with
779      * ParseException.
780      *
781      * - IDNA.ALLOW_UNASSIGNED Unassigned values can be converted to ASCII for query operations
782      * If this option is set, the unassigned code points are in the input
783      * are treated as normal Unicode code points.
784      *
785      * - IDNA.USE_STD3_RULES Use STD3 ASCII rules for host name syntax restrictions
786      * If this option is set and the input does not satisfy STD3 rules,
787      * the operation will fail with ParseException
788      * @return StringBuffer the converted String
789      * @throws ParseException
790      * @stable ICU 2.8
791      */

792     public static StringBuffer JavaDoc convertIDNToUnicode(StringBuffer JavaDoc src, int options)
793         throws StringPrepParseException{
794         return convertIDNToUnicode(src.toString(), options);
795     }
796     
797     /**
798      * Convenience function that implements the IDNToUnicode operation as defined in the IDNA RFC.
799      * This operation is done on complete domain names, e.g: "www.example.com".
800      *
801      * <b>Note:</b> IDNA RFC specifies that a conformant application should divide a domain name
802      * into separate labels, decide whether to apply allowUnassigned and useSTD3ASCIIRules on each,
803      * and then convert. This function does not offer that level of granularity. The options once
804      * set will apply to all labels in the domain name
805      *
806      * @param src The input string to be processed
807      * @param options A bit set of options:
808      * - IDNA.DEFAULT Use default options, i.e., do not process unassigned code points
809      * and do not use STD3 ASCII rules
810      * If unassigned code points are found the operation fails with
811      * ParseException.
812      *
813      * - IDNA.ALLOW_UNASSIGNED Unassigned values can be converted to ASCII for query operations
814      * If this option is set, the unassigned code points are in the input
815      * are treated as normal Unicode code points.
816      *
817      * - IDNA.USE_STD3_RULES Use STD3 ASCII rules for host name syntax restrictions
818      * If this option is set and the input does not satisfy STD3 rules,
819      * the operation will fail with ParseException
820      * @return StringBuffer the converted String
821      * @throws ParseException
822      * @stable ICU 2.8
823      */

824     public static StringBuffer JavaDoc convertIDNToUnicode(String JavaDoc src, int options)
825         throws StringPrepParseException{
826             
827         char[] srcArr = src.toCharArray();
828         StringBuffer JavaDoc result = new StringBuffer JavaDoc();
829         int sepIndex=0;
830         int oldSepIndex=0;
831         for(;;){
832             sepIndex = getSeparatorIndex(srcArr,sepIndex,srcArr.length);
833             String JavaDoc label = new String JavaDoc(srcArr,oldSepIndex,sepIndex-oldSepIndex);
834             if(label.length()==0 && sepIndex!=srcArr.length ){
835                 throw new StringPrepParseException("Found zero length lable after NamePrep.",StringPrepParseException.ZERO_LENGTH_LABEL);
836             }
837             UCharacterIterator iter = UCharacterIterator.getInstance(label);
838             result.append(convertToUnicode(iter,options));
839             if(sepIndex==srcArr.length){
840                 break;
841             }
842             // increment the sepIndex to skip past the separator
843
sepIndex++;
844             oldSepIndex =sepIndex;
845             result.append((char)FULL_STOP);
846         }
847         return result;
848     }
849     
850     /**
851      * Compare two IDN strings for equivalence.
852      * This function splits the domain names into labels and compares them.
853      * According to IDN RFC, whenever two labels are compared, they are
854      * considered equal if and only if their ASCII forms (obtained by
855      * applying toASCII) match using an case-insensitive ASCII comparison.
856      * Two domain names are considered a match if and only if all labels
857      * match regardless of whether label separators match.
858      *
859      * @param s1 First IDN string as StringBuffer
860      * @param s2 Second IDN string as StringBuffer
861      * @param options A bit set of options:
862      * - IDNA.DEFAULT Use default options, i.e., do not process unassigned code points
863      * and do not use STD3 ASCII rules
864      * If unassigned code points are found the operation fails with
865      * ParseException.
866      *
867      * - IDNA.ALLOW_UNASSIGNED Unassigned values can be converted to ASCII for query operations
868      * If this option is set, the unassigned code points are in the input
869      * are treated as normal Unicode code points.
870      *
871      * - IDNA.USE_STD3_RULES Use STD3 ASCII rules for host name syntax restrictions
872      * If this option is set and the input does not satisfy STD3 rules,
873      * the operation will fail with ParseException
874      * @return 0 if the strings are equal, > 0 if s1 > s2 and < 0 if s1 < s2
875      * @throws ParseException
876      * @stable ICU 2.8
877      */

878     // TODO: optimize
879
public static int compare(StringBuffer JavaDoc s1, StringBuffer JavaDoc s2, int options)
880         throws StringPrepParseException{
881         if(s1==null || s2 == null){
882             throw new IllegalArgumentException JavaDoc("One of the source buffers is null");
883         }
884         StringBuffer JavaDoc s1Out = convertIDNToASCII(s1.toString(),options);
885         StringBuffer JavaDoc s2Out = convertIDNToASCII(s2.toString(), options);
886         return compareCaseInsensitiveASCII(s1Out,s2Out);
887     }
888     
889     /**
890      * Compare two IDN strings for equivalence.
891      * This function splits the domain names into labels and compares them.
892      * According to IDN RFC, whenever two labels are compared, they are
893      * considered equal if and only if their ASCII forms (obtained by
894      * applying toASCII) match using an case-insensitive ASCII comparison.
895      * Two domain names are considered a match if and only if all labels
896      * match regardless of whether label separators match.
897      *
898      * @param s1 First IDN string
899      * @param s2 Second IDN string
900      * @param options A bit set of options:
901      * - IDNA.DEFAULT Use default options, i.e., do not process unassigned code points
902      * and do not use STD3 ASCII rules
903      * If unassigned code points are found the operation fails with
904      * ParseException.
905      *
906      * - IDNA.ALLOW_UNASSIGNED Unassigned values can be converted to ASCII for query operations
907      * If this option is set, the unassigned code points are in the input
908      * are treated as normal Unicode code points.
909      *
910      * - IDNA.USE_STD3_RULES Use STD3 ASCII rules for host name syntax restrictions
911      * If this option is set and the input does not satisfy STD3 rules,
912      * the operation will fail with ParseException
913      * @return 0 if the strings are equal, > 0 if s1 > s2 and < 0 if s1 < s2
914      * @throws ParseException
915      * @stable ICU 2.8
916      */

917     // TODO: optimize
918
public static int compare(String JavaDoc s1, String JavaDoc s2, int options)
919         throws StringPrepParseException{
920         if(s1==null || s2 == null){
921             throw new IllegalArgumentException JavaDoc("One of the source buffers is null");
922         }
923         StringBuffer JavaDoc s1Out = convertIDNToASCII(s1, options);
924         StringBuffer JavaDoc s2Out = convertIDNToASCII(s2, options);
925         return compareCaseInsensitiveASCII(s1Out,s2Out);
926     }
927     /**
928      * Compare two IDN strings for equivalence.
929      * This function splits the domain names into labels and compares them.
930      * According to IDN RFC, whenever two labels are compared, they are
931      * considered equal if and only if their ASCII forms (obtained by
932      * applying toASCII) match using an case-insensitive ASCII comparison.
933      * Two domain names are considered a match if and only if all labels
934      * match regardless of whether label separators match.
935      *
936      * @param s1 First IDN string as UCharacterIterator
937      * @param s2 Second IDN string as UCharacterIterator
938      * @param options A bit set of options:
939      * - IDNA.DEFAULT Use default options, i.e., do not process unassigned code points
940      * and do not use STD3 ASCII rules
941      * If unassigned code points are found the operation fails with
942      * ParseException.
943      *
944      * - IDNA.ALLOW_UNASSIGNED Unassigned values can be converted to ASCII for query operations
945      * If this option is set, the unassigned code points are in the input
946      * are treated as normal Unicode code points.
947      *
948      * - IDNA.USE_STD3_RULES Use STD3 ASCII rules for host name syntax restrictions
949      * If this option is set and the input does not satisfy STD3 rules,
950      * the operation will fail with ParseException
951      * @return 0 if the strings are equal, > 0 if i1 > i2 and < 0 if i1 < i2
952      * @throws ParseException
953      * @stable ICU 2.8
954      */

955     // TODO: optimize
956
public static int compare(UCharacterIterator s1, UCharacterIterator s2, int options)
957         throws StringPrepParseException{
958         if(s1==null || s2 == null){
959             throw new IllegalArgumentException JavaDoc("One of the source buffers is null");
960         }
961         StringBuffer JavaDoc s1Out = convertIDNToASCII(s1.getText(), options);
962         StringBuffer JavaDoc s2Out = convertIDNToASCII(s2.getText(), options);
963         return compareCaseInsensitiveASCII(s1Out,s2Out);
964     }
965 }
966
Popular Tags