KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > com > ibm > icu > text > RBBISetBuilder


1 /*
2  *******************************************************************************
3  * Copyright (C) 2003-2006,
4  * International Business Machines Corporation and others. All Rights Reserved.
5  *******************************************************************************
6  */

7  
8 package com.ibm.icu.text;
9 import java.util.List JavaDoc;
10 import java.util.ArrayList JavaDoc;
11 import java.util.Iterator JavaDoc;
12 import java.io.OutputStream JavaDoc;
13 import java.io.IOException JavaDoc;
14
15 import com.ibm.icu.impl.Assert;
16 import com.ibm.icu.impl.CharTrie;
17 import com.ibm.icu.impl.Trie;
18 import com.ibm.icu.impl.IntTrieBuilder;
19
20 //
21
// RBBISetBuilder Handles processing of Unicode Sets from RBBI rules
22
// (part of the rule building process.)
23
//
24
// Starting with the rules parse tree from the scanner,
25
//
26
// - Enumerate the set of UnicodeSets that are referenced
27
// by the RBBI rules.
28
// - compute a set of non-overlapping character ranges
29
// with all characters within a range belonging to the same
30
// set of input uniocde sets.
31
// - Derive a set of non-overlapping UnicodeSet (like things)
32
// that will correspond to columns in the state table for
33
// the RBBI execution engine. All characters within one
34
// of these sets belong to the same set of the original
35
// UnicodeSets from the user's rules.
36
// - construct the trie table that maps input characters
37
// to the index of the matching non-overlapping set of set from
38
// the previous step.
39
//
40
class RBBISetBuilder {
41     static class RangeDescriptor {
42            int fStartChar; // Start of range, unicode 32 bit value.
43
int fEndChar; // End of range, unicode 32 bit value.
44
int fNum; // runtime-mapped input value for this range.
45
List JavaDoc fIncludesSets; // vector of the the original
46
// Unicode sets that include this range.
47
// (Contains ptrs to uset nodes)
48
RangeDescriptor fNext; // Next RangeDescriptor in the linked list.
49

50             RangeDescriptor() {
51                 fIncludesSets = new ArrayList JavaDoc();
52             }
53             
54             RangeDescriptor(RangeDescriptor other) {
55                 fStartChar = other.fStartChar;
56                 fEndChar = other.fEndChar;
57                 fNum = other.fNum;
58                 fIncludesSets = new ArrayList JavaDoc(other.fIncludesSets);
59             }
60  
61             //-------------------------------------------------------------------------------------
62
//
63
// RangeDesriptor::split()
64
//
65
//-------------------------------------------------------------------------------------
66
void split(int where) {
67                 Assert.assrt(where>fStartChar && where<=fEndChar);
68                 RangeDescriptor nr = new RangeDescriptor(this);
69  
70                 // RangeDescriptor copy constructor copies all fields.
71
// Only need to update those that are different after the split.
72
nr.fStartChar = where;
73                 this.fEndChar = where-1;
74                 nr.fNext = this.fNext;
75                 this.fNext = nr;
76                 
77                 // TODO: fIncludesSets is not updated. Check it out.
78
// Probably because they haven't been populated yet,
79
// but still sloppy.
80
}
81
82             
83             //-------------------------------------------------------------------------------------
84
//
85
// RangeDescriptor::setDictionaryFlag
86
//
87
// Character Category Numbers that include characters from
88
// the original Unicode Set named "dictionary" have bit 14
89
// set to 1. The RBBI runtime engine uses this to trigger
90
// use of the word dictionary.
91
//
92
// This function looks through the Unicode Sets that it
93
// (the range) includes, and sets the bit in fNum when
94
// "dictionary" is among them.
95
//
96
// TODO: a faster way would be to find the set node for
97
// "dictionary" just once, rather than looking it
98
// up by name every time.
99
//
100
// -------------------------------------------------------------------------------------
101
void setDictionaryFlag() {
102                 int i;
103                 
104                 for (i=0; i<this.fIncludesSets.size(); i++) {
105                     RBBINode usetNode = (RBBINode)fIncludesSets.get(i);
106                     String JavaDoc setName = "";
107                     RBBINode setRef = usetNode.fParent;
108                     if (setRef != null) {
109                         RBBINode varRef = setRef.fParent;
110                         if (varRef != null && varRef.fType == RBBINode.varRef) {
111                             setName = varRef.fText;
112                         }
113                     }
114                     if (setName.equals("dictionary")) {
115                         this.fNum |= 0x4000;
116                         break;
117                     }
118                 }
119
120         };
121     }
122
123     
124     RBBIRuleBuilder fRB; // The RBBI Rule Compiler that owns us.
125
RangeDescriptor fRangeList; // Head of the linked list of RangeDescriptors
126

127     IntTrieBuilder fTrie; // The mapping TRIE that is the end result of processing
128
int fTrieSize; // the Unicode Sets.
129

130     // Groups correspond to character categories -
131
// groups of ranges that are in the same original UnicodeSets.
132
// fGroupCount is the index of the last used group.
133
// fGroupCount+1 is also the number of columns in the RBBI state table being compiled.
134
// State table column 0 is not used. Column 1 is for end-of-input.
135
// column 2 is for group 0. Funny counting.
136
int fGroupCount;
137
138     boolean fSawBOF;
139     
140     
141     //------------------------------------------------------------------------
142
//
143
// RBBISetBuilder Constructor
144
//
145
//------------------------------------------------------------------------
146
RBBISetBuilder(RBBIRuleBuilder rb)
147     {
148         fRB = rb;
149     }
150
151
152     
153         //------------------------------------------------------------------------
154
//
155
// build Build the list of non-overlapping character ranges
156
// from the Unicode Sets.
157
//
158
//------------------------------------------------------------------------
159
void build() {
160             RBBINode usetNode;
161             RangeDescriptor rlRange;
162
163             if (fRB.fDebugEnv!=null && fRB.fDebugEnv.indexOf("usets")>=0) {printSets();}
164
165             // Initialize the process by creating a single range encompassing all characters
166
// that is in no sets.
167
//
168
fRangeList = new RangeDescriptor();
169             fRangeList.fStartChar = 0;
170             fRangeList.fEndChar = 0x10ffff;
171
172             //
173
// Find the set of non-overlapping ranges of characters
174
//
175
Iterator JavaDoc ni = fRB.fUSetNodes.iterator();
176             while (ni.hasNext()) {
177                 usetNode = (RBBINode)ni.next();
178  
179                 UnicodeSet inputSet = usetNode.fInputSet;
180                 int inputSetRangeCount = inputSet.getRangeCount();
181                 int inputSetRangeIndex = 0;
182                                  rlRange = fRangeList;
183
184                 for (;;) {
185                     if (inputSetRangeIndex >= inputSetRangeCount) {
186                         break;
187                     }
188                     int inputSetRangeBegin = inputSet.getRangeStart(inputSetRangeIndex);
189                     int inputSetRangeEnd = inputSet.getRangeEnd(inputSetRangeIndex);
190
191                     // skip over ranges from the range list that are completely
192
// below the current range from the input unicode set.
193
while (rlRange.fEndChar < inputSetRangeBegin) {
194                         rlRange = rlRange.fNext;
195                     }
196
197                     // If the start of the range from the range list is before with
198
// the start of the range from the unicode set, split the range list range
199
// in two, with one part being before (wholly outside of) the unicode set
200
// and the other containing the rest.
201
// Then continue the loop; the post-split current range will then be skipped
202
// over
203
if (rlRange.fStartChar < inputSetRangeBegin) {
204                         rlRange.split(inputSetRangeBegin);
205                          continue;
206                     }
207
208                     // Same thing at the end of the ranges...
209
// If the end of the range from the range list doesn't coincide with
210
// the end of the range from the unicode set, split the range list
211
// range in two. The first part of the split range will be
212
// wholly inside the Unicode set.
213
if (rlRange.fEndChar > inputSetRangeEnd) {
214                         rlRange.split(inputSetRangeEnd+1);
215                      }
216
217                     // The current rlRange is now entirely within the UnicodeSet range.
218
// Add this unicode set to the list of sets for this rlRange
219
if (rlRange.fIncludesSets.indexOf(usetNode) == -1) {
220                         rlRange.fIncludesSets.add(usetNode);
221                     }
222
223                     // Advance over ranges that we are finished with.
224
if (inputSetRangeEnd == rlRange.fEndChar) {
225                         inputSetRangeIndex++;
226                     }
227                     rlRange = rlRange.fNext;
228                 }
229             }
230
231             if (fRB.fDebugEnv!=null && fRB.fDebugEnv.indexOf("range")>=0) { printRanges();}
232
233             //
234
// Group the above ranges, with each group consisting of one or more
235
// ranges that are in exactly the same set of original UnicodeSets.
236
// The groups are numbered, and these group numbers are the set of
237
// input symbols recognized by the run-time state machine.
238
//
239
// Numbering: # 0 (state table column 0) is unused.
240
// # 1 is reserved - table column 1 is for end-of-input
241
// # 2 is reserved - table column 2 is for beginning-in-input
242
// # 3 is the first range list.
243
//
244
RangeDescriptor rlSearchRange;
245             for (rlRange = fRangeList; rlRange!=null; rlRange=rlRange.fNext) {
246                 for (rlSearchRange=fRangeList; rlSearchRange != rlRange; rlSearchRange=rlSearchRange.fNext) {
247                     if (rlRange.fIncludesSets.equals(rlSearchRange.fIncludesSets)) {
248                         rlRange.fNum = rlSearchRange.fNum;
249                         break;
250                     }
251                 }
252                 if (rlRange.fNum == 0) {
253                     fGroupCount ++;
254                     rlRange.fNum = fGroupCount+2;
255                     rlRange.setDictionaryFlag();
256                     addValToSets(rlRange.fIncludesSets, fGroupCount+2);
257                 }
258             }
259
260             // Handle input sets that contain the special string {eof}.
261
// Column 1 of the state table is reserved for EOF on input.
262
// Column 2 is reserved for before-the-start-input.
263
// (This column can be optimized away later if there are no rule
264
// references to {bof}.)
265
// Add this column value (1 or 2) to the equivalent expression
266
// subtree for each UnicodeSet that contains the string {eof}
267
// Because {bof} and {eof} are not a characters in the normal sense,
268
// they doesn't affect the computation of ranges or TRIE.
269

270             String JavaDoc eofString = "eof";
271             String JavaDoc bofString = "bof";
272
273             ni = fRB.fUSetNodes.iterator();
274             while (ni.hasNext()) {
275                 usetNode = (RBBINode )ni.next();
276                 UnicodeSet inputSet = usetNode.fInputSet;
277                 if (inputSet.contains(eofString)) {
278                     addValToSet(usetNode, 1);
279                 }
280                 if (inputSet.contains(bofString)) {
281                     addValToSet(usetNode, 2);
282                     fSawBOF = true;
283                 }
284             }
285
286
287             if (fRB.fDebugEnv!=null && fRB.fDebugEnv.indexOf("rgroup")>=0) {printRangeGroups();}
288             if (fRB.fDebugEnv!=null && fRB.fDebugEnv.indexOf("esets")>=0) {printSets();}
289
290
291             //IntTrieBuilder(int aliasdata[], int maxdatalength,
292
// int initialvalue, int leadunitvalue,
293
// boolean latin1linear)
294

295             fTrie = new IntTrieBuilder(null, // Data array (utrie will allocate one)
296
100000, // Max Data Length
297
0, // Initial value for all code points
298
0, // Lead Surrogate unit value,
299
true); // Keep Latin 1 in separately.
300

301             for (rlRange = fRangeList; rlRange!=null; rlRange=rlRange.fNext) {
302                 fTrie.setRange(rlRange.fStartChar, rlRange.fEndChar+1, rlRange.fNum, true);
303             }
304         }
305
306
307
308         //-----------------------------------------------------------------------------------
309
//
310
// RBBIDataManipulate A little internal class needed only to wrap of the
311
// getFoldedValue() function needed for Trie table creation.
312
//
313
//-----------------------------------------------------------------------------------
314
class RBBIDataManipulate implements IntTrieBuilder.DataManipulate {
315             public int getFoldedValue(int start, int offset) {
316                 int value;
317                 int limit;
318                 boolean [] inBlockZero = new boolean[1];
319                 
320                 limit = start + 0x400;
321                 while(start<limit) {
322                     value = fTrie.getValue(start, inBlockZero);
323                     if (inBlockZero[0]) {
324                         start += IntTrieBuilder.DATA_BLOCK_LENGTH;
325                     } else if (value != 0) {
326                         return offset | 0x08000;
327                     } else {
328                         ++start;
329                     }
330                 }
331                 return 0;
332              }
333         }
334         RBBIDataManipulate dm = new RBBIDataManipulate();
335         
336         //-----------------------------------------------------------------------------------
337
//
338
// getTrieSize() Return the size that will be required to serialize the Trie.
339
//
340
//-----------------------------------------------------------------------------------
341
int getTrieSize() {
342             int size = 0;
343             try {
344                 // The trie serialize function returns the size of the data written.
345
// null output stream says give size only, don't actually write anything.
346
size = fTrie.serialize(null, true, dm );
347             } catch (IOException JavaDoc e) {
348                 Assert.assrt (false);
349             }
350             return size;
351         }
352
353
354         //-----------------------------------------------------------------------------------
355
//
356
// serializeTrie() Write the serialized trie to an output stream
357
//
358
//-----------------------------------------------------------------------------------
359
void serializeTrie(OutputStream JavaDoc os) throws IOException JavaDoc {
360             fTrie.serialize(os, true, dm );
361        }
362
363         //------------------------------------------------------------------------
364
//
365
// addValToSets Add a runtime-mapped input value to each uset from a
366
// list of uset nodes. (val corresponds to a state table column.)
367
// For each of the original Unicode sets - which correspond
368
// directly to uset nodes - a logically equivalent expression
369
// is constructed in terms of the remapped runtime input
370
// symbol set. This function adds one runtime input symbol to
371
// a list of sets.
372
//
373
// The "logically equivalent expression" is the tree for an
374
// or-ing together of all of the symbols that go into the set.
375
//
376
//------------------------------------------------------------------------
377
void addValToSets(List JavaDoc sets, int val) {
378             int ix;
379
380             for (ix=0; ix<sets.size(); ix++) {
381                 RBBINode usetNode = (RBBINode )sets.get(ix);
382                 addValToSet(usetNode, val);
383             }
384         }
385
386         void addValToSet(RBBINode usetNode, int val) {
387             RBBINode leafNode = new RBBINode(RBBINode.leafChar);
388             leafNode.fVal = val;
389             if (usetNode.fLeftChild == null) {
390                 usetNode.fLeftChild = leafNode;
391                 leafNode.fParent = usetNode;
392             } else {
393                 // There are already input symbols present for this set.
394
// Set up an OR node, with the previous stuff as the left child
395
// and the new value as the right child.
396
RBBINode orNode = new RBBINode(RBBINode.opOr);
397                 orNode.fLeftChild = usetNode.fLeftChild;
398                 orNode.fRightChild = leafNode;
399                 orNode.fLeftChild.fParent = orNode;
400                 orNode.fRightChild.fParent = orNode;
401                 usetNode.fLeftChild = orNode;
402                 orNode.fParent = usetNode;
403             }
404         }
405
406
407         //------------------------------------------------------------------------
408
//
409
// getNumCharCategories
410
//
411
//------------------------------------------------------------------------
412
int getNumCharCategories() {
413             return fGroupCount + 3;
414         }
415
416
417         //------------------------------------------------------------------------
418
//
419
// sawBOF
420
//
421
//------------------------------------------------------------------------
422
boolean sawBOF() {
423             return fSawBOF;
424         }
425
426
427         //------------------------------------------------------------------------
428
//
429
// getFirstChar Given a runtime RBBI character category, find
430
// the first UChar32 that is in the set of chars
431
// in the category.
432
//------------------------------------------------------------------------
433
int getFirstChar(int category) {
434             RangeDescriptor rlRange;
435             int retVal = -1;
436             for (rlRange = fRangeList; rlRange!=null; rlRange=rlRange.fNext) {
437                 if (rlRange.fNum == category) {
438                     retVal = rlRange.fStartChar;
439                     break;
440                 }
441             }
442             return retVal;
443         }
444
445
446
447         //------------------------------------------------------------------------
448
//
449
// printRanges A debugging function.
450
// dump out all of the range definitions.
451
//
452
//------------------------------------------------------------------------
453
void printRanges() {
454             RangeDescriptor rlRange;
455             int i;
456
457             System.out.print("\n\n Nonoverlapping Ranges ...\n");
458             for (rlRange = fRangeList; rlRange!=null; rlRange=rlRange.fNext) {
459                 System.out.print(" " + rlRange.fNum + " " + (int)rlRange.fStartChar + "-" + (int)rlRange.fEndChar);
460
461                 for (i=0; i<rlRange.fIncludesSets.size(); i++) {
462                     RBBINode usetNode = (RBBINode )rlRange.fIncludesSets.get(i);
463                     String JavaDoc setName = "anon";
464                     RBBINode setRef = usetNode.fParent;
465                     if (setRef != null) {
466                         RBBINode varRef = setRef.fParent;
467                         if (varRef != null && varRef.fType == RBBINode.varRef) {
468                             setName = varRef.fText;
469                         }
470                     }
471                     System.out.print(setName); System.out.print(" ");
472                 }
473                 System.out.println("");
474             }
475         }
476
477
478         //------------------------------------------------------------------------
479
//
480
// printRangeGroups A debugging function.
481
// dump out all of the range groups.
482
//
483
//------------------------------------------------------------------------
484
void printRangeGroups() {
485             RangeDescriptor rlRange;
486             RangeDescriptor tRange;
487             int i;
488             int lastPrintedGroupNum = 0;
489
490             System.out.print("\nRanges grouped by Unicode Set Membership...\n");
491             for (rlRange = fRangeList; rlRange!=null; rlRange=rlRange.fNext) {
492                 int groupNum = rlRange.fNum & 0xbfff;
493                 if (groupNum > lastPrintedGroupNum) {
494                     lastPrintedGroupNum = groupNum;
495                     if (groupNum<10) {System.out.print(" ");}
496                     System.out.print(groupNum + " ");
497
498                     if ((rlRange.fNum & 0x4000) != 0) { System.out.print(" <DICT> ");}
499
500                     for (i=0; i<rlRange.fIncludesSets.size(); i++) {
501                         RBBINode usetNode = (RBBINode )rlRange.fIncludesSets.get(i);
502                         String JavaDoc setName = "anon";
503                         RBBINode setRef = usetNode.fParent;
504                         if (setRef != null) {
505                             RBBINode varRef = setRef.fParent;
506                             if (varRef != null && varRef.fType == RBBINode.varRef) {
507                                 setName = varRef.fText;
508                             }
509                         }
510                         System.out.print(setName); System.out.print(" ");
511                     }
512
513                     i = 0;
514                     for (tRange = rlRange; tRange != null; tRange = tRange.fNext) {
515                         if (tRange.fNum == rlRange.fNum) {
516                             if (i++ % 5 == 0) {
517                                 System.out.print("\n ");
518                             }
519                             RBBINode.printHex((int)tRange.fStartChar, -1);
520                             System.out.print("-");
521                             RBBINode.printHex((int)tRange.fEndChar, 0);
522                         }
523                     }
524                     System.out.print("\n");
525                 }
526             }
527             System.out.print("\n");
528         }
529
530
531         //------------------------------------------------------------------------
532
//
533
// printSets A debugging function.
534
// dump out all of the set definitions.
535
//
536
//------------------------------------------------------------------------
537
void printSets() {
538             int i;
539             System.out.print("\n\nUnicode Sets List\n------------------\n");
540             for (i=0; i<fRB.fUSetNodes.size(); i++) {
541                 RBBINode usetNode;
542                 RBBINode setRef;
543                 RBBINode varRef;
544                 String JavaDoc setName;
545
546                 usetNode = (RBBINode )fRB.fUSetNodes.get(i);
547
548                 //System.out.print(" " + i + " ");
549
RBBINode.printInt(2, i);
550                 setName = "anonymous";
551                 setRef = usetNode.fParent;
552                 if (setRef != null) {
553                     varRef = setRef.fParent;
554                     if (varRef != null && varRef.fType == RBBINode.varRef) {
555                         setName = varRef.fText;
556                     }
557                 }
558                 System.out.print(" " + setName);
559                 System.out.print(" ");
560                 System.out.print(usetNode.fText);
561                 System.out.print("\n");
562                 if (usetNode.fLeftChild != null) {
563                     usetNode.fLeftChild.printTree(true);
564                 }
565             }
566             System.out.print("\n");
567         }
568  
569
570     
571     
572 }
573
Popular Tags