UnescapeTransliterator


1   /*
2   **********************************************************************
3   *   Copyright (c) 2001-2004, International Business Machines
4   *   Corporation and others.  All Rights Reserved.
5   **********************************************************************
6   *   Date        Name        Description
7   *   11/19/2001  aliu        Creation.
8   **********************************************************************
9   */
10  package com.ibm.icu.text;
11  import com.ibm.icu.lang.*;
12  
13  /**
14   * A transliterator that converts Unicode escape forms to the
15   * characters they represent.  Escape forms have a prefix, a suffix, a
16   * radix, and minimum and maximum digit counts.
17   *
18   * <p>This class is package private.  It registers several standard
19   * variants with the system which are then accessed via their IDs.
20   *
21   * @author Alan Liu
22   */
23  class UnescapeTransliterator extends Transliterator {
24  
25      /**
26       * The encoded pattern specification.  The pattern consists of
27       * zero or more forms.  Each form consists of a prefix, suffix,
28       * radix, minimum digit count, and maximum digit count.  These
29       * values are stored as a five character header.  That is, their
30       * numeric values are cast to 16-bit characters and stored in the
31       * string.  Following these five characters, the prefix
32       * characters, then suffix characters are stored.  Each form thus
33       * takes n+5 characters, where n is the total length of the prefix
34       * and suffix.  The end is marked by a header of length one
35       * consisting of the character END.
36       */
37      private char spec[];
38  
39      /**
40       * Special character marking the end of the spec[] array.
41       */
42      private static final char END = 0xFFFF;
43  
44      /**
45       * Registers standard variants with the system.  Called by
46       * Transliterator during initialization.
47       */
48      static void register() {
49          // Unicode: "U+10FFFF" hex, min=4, max=6
50          Transliterator.registerFactory("Hex-Any/Unicode", new Transliterator.Factory() {
51              public Transliterator getInstance(String   ID) {
52                  return new UnescapeTransliterator("Hex-Any/Unicode", new char[] {
53                      2, 0, 16, 4, 6, 'U', '+',
54                      END
55                  });
56              }
57          });
58          
59          // Java: "\\uFFFF" hex, min=4, max=4
60          Transliterator.registerFactory("Hex-Any/Java", new Transliterator.Factory() {
61              public Transliterator getInstance(String   ID) {
62                  return new UnescapeTransliterator("Hex-Any/Java", new char[] {
63                      2, 0, 16, 4, 4, '\\', 'u',
64                      END
65                  });
66              }
67          });
68          
69          // C: "\\uFFFF" hex, min=4, max=4; \\U0010FFFF hex, min=8, max=8
70          Transliterator.registerFactory("Hex-Any/C", new Transliterator.Factory() {
71              public Transliterator getInstance(String   ID) {
72                  return new UnescapeTransliterator("Hex-Any/C", new char[] {
73                      2, 0, 16, 4, 4, '\\', 'u',
74                      2, 0, 16, 8, 8, '\\', 'U',
75                      END
76                  });
77              }
78          });
79          
80          // XML: "&#x10FFFF;" hex, min=1, max=6
81          Transliterator.registerFactory("Hex-Any/XML", new Transliterator.Factory() {
82              public Transliterator getInstance(String   ID) {
83                  return new UnescapeTransliterator("Hex-Any/XML", new char[] {
84                      3, 1, 16, 1, 6, '&', '#', 'x', ';',
85                      END
86                  });
87              }
88          });
89  
90          // XML10: "&1114111;" dec, min=1, max=7 (not really "Hex-Any")
91          Transliterator.registerFactory("Hex-Any/XML10", new Transliterator.Factory() {
92              public Transliterator getInstance(String   ID) {
93                  return new UnescapeTransliterator("Hex-Any/XML10", new char[] {
94                      2, 1, 10, 1, 7, '&', '#', ';',
95                      END
96                  });
97              }
98          });
99  
100         // Perl: "\\x{263A}" hex, min=1, max=6
101         Transliterator.registerFactory("Hex-Any/Perl", new Transliterator.Factory() {
102             public Transliterator getInstance(String   ID) {
103                 return new UnescapeTransliterator("Hex-Any/Perl", new char[] {
104                     3, 1, 16, 1, 6, '\\', 'x', '{', '}',
105                     END
106                 });
107             }
108         });
109 
110         // All: Java, C, Perl, XML, XML10, Unicode
111         Transliterator.registerFactory("Hex-Any", new Transliterator.Factory() {
112             public Transliterator getInstance(String   ID) {
113                 return new UnescapeTransliterator("Hex-Any", new char[] {
114                     2, 0, 16, 4, 6, 'U', '+',            // Unicode
115                     2, 0, 16, 4, 4, '\\', 'u',           // Java
116                     2, 0, 16, 8, 8, '\\', 'U',           // C (surrogates)
117                     3, 1, 16, 1, 6, '&', '#', 'x', ';',  // XML
118                     2, 1, 10, 1, 7, '&', '#', ';',       // XML10
119                     3, 1, 16, 1, 6, '\\', 'x', '{', '}', // Perl
120                     END
121                 });
122             }
123         });
124     }
125 
126     /**
127      * Package private constructor.  Takes the encoded spec array.
128      */
129     UnescapeTransliterator(String   ID, char spec[]) {
130         super(ID, null);
131         this.spec = spec;
132     }
133 
134     /**
135      * Implements {@link Transliterator#handleTransliterate}.
136      */
137     protected void handleTransliterate(Replaceable text,
138                                        Position pos, boolean isIncremental) {
139         int start = pos.start;
140         int limit = pos.limit;
141         int i, j, ipat;
142 
143       loop:
144         while (start < limit) {
145             // Loop over the forms in spec[].  Exit this loop when we
146             // match one of the specs.  Exit the outer loop if a
147             // partial match is detected and isIncremental is true.
148             for (j=0, ipat=0; spec[ipat] != END; ++j) {
149 
150                 // Read the header
151                 int prefixLen = spec[ipat++];
152                 int suffixLen = spec[ipat++];
153                 int radix     = spec[ipat++];
154                 int minDigits = spec[ipat++];
155                 int maxDigits = spec[ipat++];
156 
157                 // s is a copy of start that is advanced over the
158                 // characters as we parse them.
159                 int s = start;
160                 boolean match = true;
161 
162                 for (i=0; i<prefixLen; ++i) {
163                     if (s >= limit) {
164                         if (i > 0) {
165                             // We've already matched a character.  This is
166                             // a partial match, so we return if in
167                             // incremental mode.  In non-incremental mode,
168                             // go to the next spec.
169                             if (isIncremental) {
170                                 break loop;
171                             }
172                             match = false;
173                             break;
174                         }
175                     }
176                     char c = text.charAt(s++);
177                     if (c != spec[ipat + i]) {
178                         match = false;
179                         break;
180                     }
181                 }
182 
183                 if (match) {
184                     int u = 0;
185                     int digitCount = 0;
186                     for (;;) {
187                         if (s >= limit) {
188                             // Check for partial match in incremental mode.
189                             if (s > start && isIncremental) {
190                                 break loop;
191                             }
192                             break;
193                         }
194                         int ch = text.char32At(s);
195                         int digit = UCharacter.digit(ch, radix);
196                         if (digit < 0) {
197                             break;
198                         }
199                         s += UTF16.getCharCount(ch);
200                         u = (u * radix) + digit;
201                         if (++digitCount == maxDigits) {
202                             break;
203                         }
204                     }
205 
206                     match = (digitCount >= minDigits);
207 
208                     if (match) {
209                         for (i=0; i<suffixLen; ++i) {
210                             if (s >= limit) {
211                                 // Check for partial match in incremental mode.
212                                 if (s > start && isIncremental) {
213                                     break loop;
214                                 }
215                                 match = false;
216                                 break;
217                             }
218                             char c = text.charAt(s++);
219                             if (c != spec[ipat + prefixLen + i]) {
220                                 match = false;
221                                 break;
222                             }
223                         }
224 
225                         if (match) {
226                             // At this point, we have a match
227                             String   str = UTF16.valueOf(u);
228                             text.replace(start, s, str);
229                             limit -= s - start - str.length();
230                             // The following break statement leaves the
231                             // loop that is traversing the forms in
232                             // spec[].  We then parse the next input
233                             // character.
234                             break;
235                         }
236                     }
237                 }
238 
239                 ipat += prefixLen + suffixLen;
240             }
241 
242             if (start < limit) {
243                 start += UTF16.getCharCount(text.char32At(start));
244             }
245         }
246 
247         pos.contextLimit += limit - pos.limit;
248         pos.limit = limit;
249         pos.start = start;
250     }
251 }
252
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags