KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > java > text > RBTableBuilder


1 /*
2  * @(#)RBTableBuilder.java 1.12 03/12/19
3  *
4  * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
5  * SUN PROPRIETARY/CONFIDENTIAL. Use is subject to license terms.
6  */

7
8 /*
9  * (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved
10  * (C) Copyright IBM Corp. 1996-1998 - All Rights Reserved
11  *
12  * The original version of this source code and documentation is copyrighted
13  * and owned by Taligent, Inc., a wholly-owned subsidiary of IBM. These
14  * materials are provided under terms of a License Agreement between Taligent
15  * and Sun. This technology is protected by multiple US and International
16  * patents. This notice and attribution to Taligent may not be removed.
17  * Taligent is a registered trademark of Taligent, Inc.
18  *
19  */

20
21 package java.text;
22
23 import java.util.Vector JavaDoc;
24 import sun.text.UCompactIntArray;
25 import sun.text.IntHashtable;
26 import sun.text.Normalizer;
27 import sun.text.NormalizerImpl;
28 import sun.text.ComposedCharIter;
29 import sun.text.NormalizerUtilities;
30
31 /**
32  * This class contains all the code to parse a RuleBasedCollator pattern
33  * and build a RBCollationTables object from it. A particular instance
34  * of tis class exists only during the actual build process-- once an
35  * RBCollationTables object has been built, the RBTableBuilder object
36  * goes away. This object carries all of the state which is only needed
37  * during the build process, plus a "shadow" copy of all of the state
38  * that will go into the tables object itself. This object communicates
39  * with RBCollationTables through a separate class, RBCollationTables.BuildAPI,
40  * this is an inner class of RBCollationTables and provides a separate
41  * private API for communication with RBTableBuilder.
42  * This class isn't just an inner class of RBCollationTables itself because
43  * of its large size. For source-code readability, it seemed better for the
44  * builder to have its own source file.
45  */

46 final class RBTableBuilder {
47
48     public RBTableBuilder(RBCollationTables.BuildAPI JavaDoc tables) {
49         this.tables = tables;
50     }
51
52     /**
53      * Create a table-based collation object with the given rules.
54      * This is the main function that actually builds the tables and
55      * stores them back in the RBCollationTables object. It is called
56      * ONLY by the RBCollationTables constructor.
57      * @see java.util.RuleBasedCollator#RuleBasedCollator
58      * @exception ParseException If the rules format is incorrect.
59      */

60
61     public void build(String JavaDoc pattern, int decmp) throws ParseException JavaDoc
62     {
63         boolean isSource = true;
64         int i = 0;
65         String JavaDoc expChars;
66         String JavaDoc groupChars;
67         if (pattern.length() == 0)
68             throw new ParseException JavaDoc("Build rules empty.", 0);
69
70         // This array maps Unicode characters to their collation ordering
71
mapping = new UCompactIntArray((int)RBCollationTables.UNMAPPED);
72         // Normalize the build rules. Find occurances of all decomposed characters
73
// and normalize the rules before feeding into the builder. By "normalize",
74
// we mean that all precomposed Unicode characters must be converted into
75
// a base character and one or more combining characters (such as accents).
76
// When there are multiple combining characters attached to a base character,
77
// the combining characters must be in their canonical order
78
//
79
// sherman/Note:
80
//(1)decmp will be NO_DECOMPOSITION only in ko locale to prevent decompose
81
//hangual syllables to jamos, so we can actually just call decompose with
82
//normalizer's IGNORE_HANGUL option turned on
83
//
84
//(2)just call the "special version" in NormalizerImpl directly
85
//pattern = Normalizer.decompose(pattern, false, Normalizer.IGNORE_HANGUL, true);
86
//
87
//Normalizer.Mode mode = NormalizerUtilities.toNormalizerMode(decmp);
88
//pattern = Normalizer.normalize(pattern, mode, 0, true);
89

90         pattern = NormalizerImpl.canonicalDecomposeWithSingleQuotation(pattern);
91
92         // Build the merged collation entries
93
// Since rules can be specified in any order in the string
94
// (e.g. "c , C < d , D < e , E .... C < CH")
95
// this splits all of the rules in the string out into separate
96
// objects and then sorts them. In the above example, it merges the
97
// "C < CH" rule in just before the "C < D" rule.
98
//
99

100         mPattern = new MergeCollation JavaDoc(pattern);
101
102         int order = 0;
103
104         // Now walk though each entry and add it to my own tables
105
for (i = 0; i < mPattern.getCount(); ++i)
106         {
107             PatternEntry JavaDoc entry = mPattern.getItemAt(i);
108             if (entry != null) {
109                 groupChars = entry.getChars();
110                 if (groupChars.length() > 1) {
111             switch(groupChars.charAt(groupChars.length()-1)) {
112             case '@':
113             frenchSec = true;
114             groupChars = groupChars.substring(0, groupChars.length()-1);
115             break;
116             case '!':
117             seAsianSwapping = true;
118             groupChars = groupChars.substring(0, groupChars.length()-1);
119             break;
120             }
121                 }
122
123                 order = increment(entry.getStrength(), order);
124                 expChars = entry.getExtension();
125
126                 if (expChars.length() != 0) {
127                     addExpandOrder(groupChars, expChars, order);
128                 } else if (groupChars.length() > 1) {
129                     char ch = groupChars.charAt(0);
130                     if (Character.isHighSurrogate(ch) && groupChars.length() == 2) {
131                         addOrder(Character.toCodePoint(ch, groupChars.charAt(1)), order);
132                     } else {
133                         addContractOrder(groupChars, order);
134                     }
135                 } else {
136                     char ch = groupChars.charAt(0);
137                     addOrder(ch, order);
138                 }
139             }
140         }
141     addComposedChars();
142
143         commit();
144         mapping.compact();
145         /*
146         System.out.println("mappingSize=" + mapping.getKSize());
147         for (int j = 0; j < 0xffff; j++) {
148             int value = mapping.elementAt(j);
149             if (value != RBCollationTables.UNMAPPED)
150                 System.out.println("index=" + Integer.toString(j, 16)
151                            + ", value=" + Integer.toString(value, 16));
152         }
153     */

154         tables.fillInTables(frenchSec, seAsianSwapping, mapping, contractTable, expandTable,
155                     contractFlags, maxSecOrder, maxTerOrder);
156     }
157
158     /** Add expanding entries for pre-composed unicode characters so that this
159      * collator can be used reasonably well with decomposition turned off.
160      */

161     private void addComposedChars() throws ParseException JavaDoc {
162         // Iterate through all of the pre-composed characters in Unicode
163
ComposedCharIter iter = new ComposedCharIter();
164         int c;
165         while ((c = iter.next()) != ComposedCharIter.DONE) {
166             if (getCharOrder(c) == RBCollationTables.UNMAPPED) {
167                 //
168
// We don't already have an ordering for this pre-composed character.
169
//
170
// First, see if the decomposed string is already in our
171
// tables as a single contracting-string ordering.
172
// If so, just map the precomposed character to that order.
173
//
174
// TODO: What we should really be doing here is trying to find the
175
// longest initial substring of the decomposition that is present
176
// in the tables as a contracting character sequence, and find its
177
// ordering. Then do this recursively with the remaining chars
178
// so that we build a list of orderings, and add that list to
179
// the expansion table.
180
// That would be more correct but also significantly slower, so
181
// I'm not totally sure it's worth doing.
182
//
183
String JavaDoc s = iter.decomposition();
184
185                 //sherman/Note: if this is 1 character decomposed string, the
186
//only thing need to do is to check if this decomposed character
187
//has an entry in our order table, this order is not necessary
188
//to be a contraction order, if it does have one, add an entry
189
//for the precomposed character by using the same order, the
190
//previous impl unnecessarily adds a single character expansion
191
//entry.
192
if (s.length() == 1) {
193                     int order = getCharOrder(s.charAt(0));
194                     if (order != RBCollationTables.UNMAPPED) {
195                         addOrder(c, order);
196                     }
197                     continue;
198                 } else if (s.length() == 2) {
199                     char ch0 = s.charAt(0);
200                     if (Character.isHighSurrogate(ch0)) {
201                         int order = getCharOrder(s.codePointAt(0));
202                         if (order != RBCollationTables.UNMAPPED) {
203                             addOrder(c, order);
204                         }
205                         continue;
206                     }
207                 }
208                 int contractOrder = getContractOrder(s);
209                 if (contractOrder != RBCollationTables.UNMAPPED) {
210                     addOrder(c, contractOrder);
211                 } else {
212                     //
213
// We don't have a contracting ordering for the entire string
214
// that results from the decomposition, but if we have orders
215
// for each individual character, we can add an expanding
216
// table entry for the pre-composed character
217
//
218
boolean allThere = true;
219                     for (int i = 0; i < s.length(); i++) {
220                         if (getCharOrder(s.charAt(i)) == RBCollationTables.UNMAPPED) {
221                             allThere = false;
222                             break;
223                         }
224                     }
225                     if (allThere) {
226             addExpandOrder(c, s, RBCollationTables.UNMAPPED);
227                     }
228                 }
229             }
230         }
231     }
232
233     /**
234      * Look up for unmapped values in the expanded character table.
235      *
236      * When the expanding character tables are built by addExpandOrder,
237      * it doesn't know what the final ordering of each character
238      * in the expansion will be. Instead, it just puts the raw character
239      * code into the table, adding CHARINDEX as a flag. Now that we've
240      * finished building the mapping table, we can go back and look up
241      * that character to see what its real collation order is and
242      * stick that into the expansion table. That lets us avoid doing
243      * a two-stage lookup later.
244      */

245     private final void commit()
246     {
247         if (expandTable != null) {
248             for (int i = 0; i < expandTable.size(); i++) {
249                 int[] valueList = (int [])expandTable.elementAt(i);
250                 for (int j = 0; j < valueList.length; j++) {
251                     int order = valueList[j];
252                     if (order < RBCollationTables.EXPANDCHARINDEX && order > CHARINDEX) {
253                         // found a expanding character that isn't filled in yet
254
int ch = order - CHARINDEX;
255
256                         // Get the real values for the non-filled entry
257
int realValue = getCharOrder(ch);
258
259                         if (realValue == RBCollationTables.UNMAPPED) {
260                             // The real value is still unmapped, maybe it's ignorable
261
valueList[j] = IGNORABLEMASK & ch;
262                         } else {
263                             // just fill in the value
264
valueList[j] = realValue;
265                         }
266                     }
267                 }
268             }
269         }
270     }
271     /**
272      * Increment of the last order based on the comparison level.
273      */

274     private final int increment(int aStrength, int lastValue)
275     {
276         switch(aStrength)
277         {
278         case Collator.PRIMARY:
279             // increment priamry order and mask off secondary and tertiary difference
280
lastValue += PRIMARYORDERINCREMENT;
281             lastValue &= RBCollationTables.PRIMARYORDERMASK;
282             isOverIgnore = true;
283             break;
284         case Collator.SECONDARY:
285             // increment secondary order and mask off tertiary difference
286
lastValue += SECONDARYORDERINCREMENT;
287             lastValue &= RBCollationTables.SECONDARYDIFFERENCEONLY;
288             // record max # of ignorable chars with secondary difference
289
if (!isOverIgnore)
290                 maxSecOrder++;
291             break;
292         case Collator.TERTIARY:
293             // increment tertiary order
294
lastValue += TERTIARYORDERINCREMENT;
295             // record max # of ignorable chars with tertiary difference
296
if (!isOverIgnore)
297                 maxTerOrder++;
298             break;
299         }
300         return lastValue;
301     }
302
303     /**
304      * Adds a character and its designated order into the collation table.
305      */

306     private final void addOrder(int ch, int anOrder)
307     {
308         // See if the char already has an order in the mapping table
309
int order = mapping.elementAt(ch);
310
311         if (order >= RBCollationTables.CONTRACTCHARINDEX) {
312             // There's already an entry for this character that points to a contracting
313
// character table. Instead of adding the character directly to the mapping
314
// table, we must add it to the contract table instead.
315
int length = 1;
316             if (Character.isSupplementaryCodePoint(ch)) {
317                 length = Character.toChars(ch, keyBuf, 0);
318             } else {
319                 keyBuf[0] = (char)ch;
320             }
321             addContractOrder(new String JavaDoc(keyBuf, 0, length), anOrder);
322         } else {
323             // add the entry to the mapping table,
324
// the same later entry replaces the previous one
325
mapping.setElementAt(ch, anOrder);
326         }
327     }
328
329     private final void addContractOrder(String JavaDoc groupChars, int anOrder) {
330         addContractOrder(groupChars, anOrder, true);
331     }
332
333     /**
334      * Adds the contracting string into the collation table.
335      */

336     private final void addContractOrder(String JavaDoc groupChars, int anOrder,
337                                           boolean fwd)
338     {
339         if (contractTable == null) {
340             contractTable = new Vector JavaDoc(INITIALTABLESIZE);
341         }
342
343     //initial character
344
int ch = groupChars.codePointAt(0);
345     /*
346         char ch0 = groupChars.charAt(0);
347         int ch = Character.isHighSurrogate(ch0)?
348       Character.toCodePoint(ch0, groupChars.charAt(1)):ch0;
349       */

350         // See if the initial character of the string already has a contract table.
351
int entry = mapping.elementAt(ch);
352         Vector JavaDoc entryTable = getContractValuesImpl(entry - RBCollationTables.CONTRACTCHARINDEX);
353
354         if (entryTable == null) {
355             // We need to create a new table of contract entries for this base char
356
int tableIndex = RBCollationTables.CONTRACTCHARINDEX + contractTable.size();
357             entryTable = new Vector JavaDoc(INITIALTABLESIZE);
358             contractTable.addElement(entryTable);
359
360             // Add the initial character's current ordering first. then
361
// update its mapping to point to this contract table
362
entryTable.addElement(new EntryPair JavaDoc(groupChars.substring(0,Character.charCount(ch)), entry));
363             mapping.setElementAt(ch, tableIndex);
364         }
365
366         // Now add (or replace) this string in the table
367
int index = RBCollationTables.getEntry(entryTable, groupChars, fwd);
368         if (index != RBCollationTables.UNMAPPED) {
369             EntryPair JavaDoc pair = (EntryPair JavaDoc) entryTable.elementAt(index);
370             pair.value = anOrder;
371         } else {
372             EntryPair JavaDoc pair = (EntryPair JavaDoc)entryTable.lastElement();
373
374             // NOTE: This little bit of logic is here to speed CollationElementIterator
375
// .nextContractChar(). This code ensures that the longest sequence in
376
// this list is always the _last_ one in the list. This keeps
377
// nextContractChar() from having to search the entire list for the longest
378
// sequence.
379
if (groupChars.length() > pair.entryName.length()) {
380                 entryTable.addElement(new EntryPair JavaDoc(groupChars, anOrder, fwd));
381             } else {
382                 entryTable.insertElementAt(new EntryPair JavaDoc(groupChars, anOrder,
383                         fwd), entryTable.size() - 1);
384             }
385         }
386
387         // If this was a forward mapping for a contracting string, also add a
388
// reverse mapping for it, so that CollationElementIterator.previous
389
// can work right
390
if (fwd && groupChars.length() > 1) {
391             addContractFlags(groupChars);
392             addContractOrder(new StringBuffer JavaDoc(groupChars).reverse().toString(),
393                              anOrder, false);
394         }
395     }
396
397     /**
398      * If the given string has been specified as a contracting string
399      * in this collation table, return its ordering.
400      * Otherwise return UNMAPPED.
401      */

402     private int getContractOrder(String JavaDoc groupChars)
403     {
404         int result = RBCollationTables.UNMAPPED;
405         if (contractTable != null) {
406         int ch = groupChars.codePointAt(0);
407         /*
408             char ch0 = groupChars.charAt(0);
409             int ch = Character.isHighSurrogate(ch0)?
410           Character.toCodePoint(ch0, groupChars.charAt(1)):ch0;
411           */

412             Vector JavaDoc entryTable = getContractValues(ch);
413             if (entryTable != null) {
414                 int index = RBCollationTables.getEntry(entryTable, groupChars, true);
415                 if (index != RBCollationTables.UNMAPPED) {
416                     EntryPair JavaDoc pair = (EntryPair JavaDoc) entryTable.elementAt(index);
417                     result = pair.value;
418                 }
419             }
420         }
421         return result;
422     }
423
424     private final int getCharOrder(int ch) {
425         int order = mapping.elementAt(ch);
426
427         if (order >= RBCollationTables.CONTRACTCHARINDEX) {
428             Vector JavaDoc groupList = getContractValuesImpl(order - RBCollationTables.CONTRACTCHARINDEX);
429             EntryPair JavaDoc pair = (EntryPair JavaDoc)groupList.firstElement();
430             order = pair.value;
431         }
432         return order;
433     }
434
435     /**
436      * Get the entry of hash table of the contracting string in the collation
437      * table.
438      * @param ch the starting character of the contracting string
439      */

440     private Vector JavaDoc getContractValues(int ch)
441     {
442         int index = mapping.elementAt(ch);
443         return getContractValuesImpl(index - RBCollationTables.CONTRACTCHARINDEX);
444     }
445
446     private Vector JavaDoc getContractValuesImpl(int index)
447     {
448         if (index >= 0)
449         {
450             return (Vector JavaDoc)contractTable.elementAt(index);
451         }
452         else // not found
453
{
454             return null;
455         }
456     }
457
458     /**
459      * Adds the expanding string into the collation table.
460      */

461     private final void addExpandOrder(String JavaDoc contractChars,
462                 String JavaDoc expandChars,
463                                 int anOrder) throws ParseException JavaDoc
464     {
465         // Create an expansion table entry
466
int tableIndex = addExpansion(anOrder, expandChars);
467
468         // And add its index into the main mapping table
469
if (contractChars.length() > 1) {
470             char ch = contractChars.charAt(0);
471             if (Character.isHighSurrogate(ch) && contractChars.length() == 2) {
472         char ch2 = contractChars.charAt(1);
473         if (Character.isLowSurrogate(ch2)) {
474             //only add into table when it is a legal surrogate
475
addOrder(Character.toCodePoint(ch, ch2), tableIndex);
476         }
477             } else {
478                 addContractOrder(contractChars, tableIndex);
479             }
480         } else {
481             addOrder(contractChars.charAt(0), tableIndex);
482         }
483     }
484
485     private final void addExpandOrder(int ch, String JavaDoc expandChars, int anOrder)
486       throws ParseException JavaDoc
487     {
488         int tableIndex = addExpansion(anOrder, expandChars);
489         addOrder(ch, tableIndex);
490     }
491
492     /**
493      * Create a new entry in the expansion table that contains the orderings
494      * for the given characers. If anOrder is valid, it is added to the
495      * beginning of the expanded list of orders.
496      */

497     private int addExpansion(int anOrder, String JavaDoc expandChars) {
498         if (expandTable == null) {
499             expandTable = new Vector JavaDoc(INITIALTABLESIZE);
500         }
501
502         // If anOrder is valid, we want to add it at the beginning of the list
503
int offset = (anOrder == RBCollationTables.UNMAPPED) ? 0 : 1;
504
505     int[] valueList = new int[expandChars.length() + offset];
506         if (offset == 1) {
507             valueList[0] = anOrder;
508         }
509
510         int j = offset;
511         for (int i = 0; i < expandChars.length(); i++) {
512             char ch0 = expandChars.charAt(i);
513         char ch1;
514         int ch;
515             if (Character.isHighSurrogate(ch0)) {
516         if (++i == expandChars.length() ||
517             !Character.isLowSurrogate(ch1=expandChars.charAt(i))) {
518             //ether we are missing the low surrogate or the next char
519
//is not a legal low surrogate, so stop loop
520
break;
521                 }
522         ch = Character.toCodePoint(ch0, ch1);
523         
524         } else {
525         ch = ch0;
526         }
527
528             int mapValue = getCharOrder(ch);
529
530             if (mapValue != RBCollationTables.UNMAPPED) {
531                 valueList[j++] = mapValue;
532             } else {
533                 // can't find it in the table, will be filled in by commit().
534
valueList[j++] = CHARINDEX + ch;
535             }
536         }
537         if (j < valueList.length) {
538         //we had at least one supplementary character, the size of valueList
539
//is bigger than it really needs...
540
int[] tmpBuf = new int[j];
541             while (--j >= 0) {
542                 tmpBuf[j] = valueList[j];
543             }
544             valueList = tmpBuf;
545         }
546         // Add the expanding char list into the expansion table.
547
int tableIndex = RBCollationTables.EXPANDCHARINDEX + expandTable.size();
548         expandTable.addElement(valueList);
549
550         return tableIndex;
551     }
552
553     private void addContractFlags(String JavaDoc chars) {
554         char c0;
555         int c;
556         int len = chars.length();
557         for (int i = 0; i < len; i++) {
558             c0 = chars.charAt(i);
559             c = Character.isHighSurrogate(c0)
560                           ?Character.toCodePoint(c0, chars.charAt(++i))
561                       :c0;
562             contractFlags.put(c, 1);
563         }
564     }
565
566     // ==============================================================
567
// constants
568
// ==============================================================
569
final static int CHARINDEX = 0x70000000; // need look up in .commit()
570

571     private final static int IGNORABLEMASK = 0x0000ffff;
572     private final static int PRIMARYORDERINCREMENT = 0x00010000;
573     private final static int SECONDARYORDERINCREMENT = 0x00000100;
574     private final static int TERTIARYORDERINCREMENT = 0x00000001;
575     private final static int INITIALTABLESIZE = 20;
576     private final static int MAXKEYSIZE = 5;
577
578     // ==============================================================
579
// instance variables
580
// ==============================================================
581

582     // variables used by the build process
583
private RBCollationTables.BuildAPI JavaDoc tables = null;
584     private MergeCollation JavaDoc mPattern = null;
585     private boolean isOverIgnore = false;
586     private char[] keyBuf = new char[MAXKEYSIZE];
587     private IntHashtable contractFlags = new IntHashtable(100);
588
589     // "shadow" copies of the instance variables in RBCollationTables
590
// (the values in these variables are copied back into RBCollationTables
591
// at the end of the build process)
592
private boolean frenchSec = false;
593     private boolean seAsianSwapping = false;
594
595     private UCompactIntArray mapping = null;
596     private Vector JavaDoc contractTable = null;
597     private Vector JavaDoc expandTable = null;
598
599     private short maxSecOrder = 0;
600     private short maxTerOrder = 0;
601 }
602
Popular Tags