Normalizer


1   package net.sf.saxon.codenorm;
2   
3   import net.sf.saxon.om.XMLChar;
4   
5   /**
6    * Implements Unicode Normalization Forms C, D, KC, KD.<br>
7    * Copyright (c) 1991-2005 Unicode, Inc.
8    * For terms of use, see http://www.unicode.org/terms_of_use.html
9    * For documentation, see UAX#15.<br>
10   * The Unicode Consortium makes no expressed or implied warranty of any
11   * kind, and assumes no liability for errors or omissions.
12   * No liability is assumed for incidental and consequential damages
13   * in connection with or arising out of the use of the information here.
14   * @author Mark Davis
15   * Updates for supplementary code points: Vladimir Weinstein & Markus Scherer
16   * Modified to remove dependency on ICU code: Michael Kay
17   */
18  
19  public class Normalizer {
20  
21      /**
22       * Create a normalizer for a given form.
23       */
24      public Normalizer(byte form) {
25          this.form = form;
26          if (data == null) {
27              data = UnicodeDataParser.build(); // load 1st time
28          }
29      }
30  
31      /**
32      * Masks for the form selector
33      */
34      static final byte
35          COMPATIBILITY_MASK = 1,
36          COMPOSITION_MASK = 2;
37  
38      /**
39      * Normalization Form Selector
40      */
41      public static final byte
42          D = 0 ,
43          C = COMPOSITION_MASK,
44          KD = COMPATIBILITY_MASK,
45          KC = (byte)(COMPATIBILITY_MASK + COMPOSITION_MASK);
46  
47      /**
48      * Normalizes text according to the chosen form,
49      * replacing contents of the target buffer.
50      * @param   source      the original text, unnormalized
51      * @param   target      the resulting normalized text
52      */
53      public StringBuffer   normalize(String   source, StringBuffer   target) {
54  
55          // First decompose the source into target,
56          // then compose if the form requires.
57  
58          if (source.length() != 0) {
59              internalDecompose(source, target);
60              if ((form & COMPOSITION_MASK) != 0) {
61                  internalCompose(target);
62              }
63          }
64          return target;
65      }
66  
67      /**
68      * Normalizes text according to the chosen form
69      * @param   source      the original text, unnormalized
70      * @return  target      the resulting normalized text
71      */
72      public String   normalize(String   source) {
73          return normalize(source, new StringBuffer  (source.length()+8)).toString();
74      }
75  
76      // ======================================
77      //                  PRIVATES
78      // ======================================
79  
80      /**
81       * The current form.
82       */
83      private byte form;
84  
85      /**
86      * Decomposes text, either canonical or compatibility,
87      * replacing contents of the target buffer.
88  //    * @param   form        the normalization form. If COMPATIBILITY_MASK
89  //    *                      bit is on in this byte, then selects the recursive
90  //    *                      compatibility decomposition, otherwise selects
91  //    *                      the recursive canonical decomposition.
92      * @param   source      the original text, unnormalized
93      * @param   target      the resulting normalized text
94      */
95      private void internalDecompose(String   source, StringBuffer   target) {
96          StringBuffer   buffer = new StringBuffer  (8);
97          boolean canonical = (form & COMPATIBILITY_MASK) == 0;
98          int ch32;
99          //for (int i = 0; i < source.length(); i += (ch32<65536 ? 1 : 2)) {
100         for (int i = 0; i < source.length();) {
101             buffer.setLength(0);
102             //ch32 = UTF16.charAt(source, i);
103             ch32 = source.charAt(i++);
104             if (XMLChar.isHighSurrogate(ch32)) {
105                 char low = source.charAt(i++);
106                 ch32 = XMLChar.supplemental((char)ch32, low);
107             }
108             data.getRecursiveDecomposition(canonical, ch32, buffer);
109 
110             // add all of the characters in the decomposition.
111             // (may be just the original character, if there was
112             // no decomposition mapping)
113 
114             int ch;
115             //for (int j = 0; j < buffer.length(); j += (ch<65536 ? 1 : 2)) {
116             for (int j = 0; j < buffer.length();) {
117                 //ch = UTF16.charAt(buffer, j);
118                 ch = buffer.charAt(j++);
119                 if (XMLChar.isHighSurrogate(ch32)) {
120                     char low = buffer.charAt(j++);
121                     ch = XMLChar.supplemental((char)ch, low);
122                 }
123                 int chClass = data.getCanonicalClass(ch);
124                 int k = target.length(); // insertion point
125                 if (chClass != 0) {
126 
127                     // bubble-sort combining marks as necessary
128 
129                     int ch2;
130                     while (k > 0) {
131                         ch2 = target.charAt(k-1);
132                         if (XMLChar.isSurrogate(ch2)) {
133                             k--;
134                             char high = buffer.charAt(k-1);
135                             ch2 = XMLChar.supplemental(high, (char)ch2);
136                         }
137                         if (data.getCanonicalClass(ch2) <= chClass) break;
138                         k--;
139                     }
140 //                    for (; k > 0; k -= (ch2<65536 ? 1 : 2)) {
141 //                        ch2 = UTF16.charAt(target, k-1);
142 //                        if (data.getCanonicalClass(ch2) <= chClass) break;
143 //                    }
144                 }
145                 if (ch < 65536) {
146                     target.insert(k, (char)ch);
147                 } else {
148                     String   s = "" + XMLChar.highSurrogate(ch) + XMLChar.lowSurrogate(ch);
149                     target.insert(k, s);
150                 }
151                 //target.insert(k, UTF16.valueOf(ch));
152             }
153         }
154     }
155 
156     /**
157     * Composes text in place. Target must already
158     * have been decomposed.
159     * @param   target      input: decomposed text.
160     *                      output: the resulting normalized text.
161     */
162     private void internalCompose(StringBuffer   target) {
163 
164         int starterPos = 0;
165         //int starterCh = UTF16.charAt(target,0);
166         //int compPos = (starterCh<65536 ? 1 : 2); // length of last composition
167         int starterCh = target.charAt(0);
168         int compPos = 1;
169         if (XMLChar.isHighSurrogate(starterCh)) {
170             starterCh = XMLChar.supplemental((char)starterCh, target.charAt(1));
171             compPos++;
172         }
173         int lastClass = data.getCanonicalClass(starterCh);
174         if (lastClass != 0) lastClass = 256; // fix for strings staring with a combining mark
175         int oldLen = target.length();
176 
177         // Loop on the decomposed characters, combining where possible
178 
179         int ch;
180         //for (int decompPos = compPos; decompPos < target.length(); decompPos += (ch<65536 ? 1 : 2)) {
181         for (int decompPos = compPos; decompPos < target.length();) {
182             ch = target.charAt(decompPos++);
183             if (XMLChar.isHighSurrogate(ch)) {
184                 ch = XMLChar.supplemental((char)ch, target.charAt(decompPos++));
185             }
186             //ch = UTF16.charAt(target, decompPos);
187             int chClass = data.getCanonicalClass(ch);
188             int composite = data.getPairwiseComposition(starterCh, ch);
189             if (composite != NormalizerData.NOT_COMPOSITE && (lastClass < chClass || lastClass == 0)) {
190                 setCharAt(target, starterPos, composite);
191                 // we know that we will only be replacing non-supplementaries by non-supplementaries
192                 // so we don't have to adjust the decompPos
193                 starterCh = composite;
194             } else {
195                 if (chClass == 0) {
196                     starterPos = compPos;
197                     starterCh  = ch;
198                 }
199                 lastClass = chClass;
200                 setCharAt(target, compPos, ch);
201                 if (target.length() != oldLen) { // MAY HAVE TO ADJUST!
202                     decompPos += target.length() - oldLen;
203                     oldLen = target.length();
204                 }
205                 compPos += (ch<65536 ? 1 : 2);
206             }
207         }
208         target.setLength(compPos);
209     }
210 
211     /**
212      * Set the 32-bit character at a particular 16-bit offset in a string buffer,
213      * replacing the previous character at that position, and taking account of the
214      * fact that either, both, or neither of the characters might be a surrogate pair.
215      */
216 
217     private static void setCharAt(StringBuffer   target, int offset, int ch32) {
218         if (ch32 < 65536) {
219             if (XMLChar.isHighSurrogate(target.charAt(offset))) {
220                 target.setCharAt(offset, (char)ch32);
221                 target.deleteCharAt(offset+1);
222             } else {
223                 target.setCharAt(offset, (char)ch32);
224             }
225         } else {
226             if (XMLChar.isHighSurrogate(target.charAt(offset))) {
227                 target.setCharAt(offset, XMLChar.highSurrogate(ch32));
228                 target.setCharAt(offset+1, XMLChar.lowSurrogate(ch32));
229             } else {
230                 target.setCharAt(offset, XMLChar.highSurrogate(ch32));
231                 target.insert(offset+1, XMLChar.lowSurrogate(ch32));
232             }
233         }
234     }
235 
236     /**
237     * Contains normalization data from the Unicode Character Database.
238     * use false for the minimal set, true for the real set.
239     */
240     private static NormalizerData data = null;
241 
242     /**
243     * Just accessible for testing.
244     */
245     boolean getExcluded (char ch) {
246         return data.getExcluded(ch);
247     }
248 
249     /**
250     * Just accessible for testing.
251     */
252     String   getRawDecompositionMapping (char ch) {
253         return data.getRawDecompositionMapping(ch);
254     }
255 }
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags