KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > com > ibm > icu > text > RBBIDataWrapper


1 /**
2 *******************************************************************************
3 * Copyright (C) 1996-2006, International Business Machines Corporation and *
4 * others. All Rights Reserved. *
5 *******************************************************************************
6 */

7
8 package com.ibm.icu.text;
9
10 import java.io.BufferedInputStream JavaDoc;
11 import java.io.InputStream JavaDoc;
12 import java.io.DataInputStream JavaDoc;
13 import java.io.IOException JavaDoc;
14
15 import com.ibm.icu.impl.ICUData;
16 import com.ibm.icu.impl.ICUResourceBundle;
17 import com.ibm.icu.impl.Trie;
18 import com.ibm.icu.impl.CharTrie;
19
/**
 * <p>Internal class used for Rule Based Break Iterators.</p>
 * <p>This class provides access to the compiled break rule data, as
 * it is stored in a .brk file.</p>
 * @internal
 */

27 final class RBBIDataWrapper {
28     //
29
// These fields are the ready-to-use compiled rule data, as
30
// read from the file.
31
//
32
RBBIDataHeader fHeader;
33     short fFTable[];
34     short fRTable[];
35     short fSFTable[];
36     short fSRTable[];
37     CharTrie fTrie;
38     String JavaDoc fRuleSource;
39     int fStatusTable[];
40     
41     //
42
// Indexes to fields in the ICU4C style binary form of the RBBI Data Header
43
// Used by the rule compiler when flattening the data.
44
//
45
final static int DH_SIZE = 24;
46     final static int DH_MAGIC = 0;
47     final static int DH_FORMATVERSION = 1;
48     final static int DH_LENGTH = 2;
49     final static int DH_CATCOUNT = 3;
50     final static int DH_FTABLE = 4;
51     final static int DH_FTABLELEN = 5;
52     final static int DH_RTABLE = 6;
53     final static int DH_RTABLELEN = 7;
54     final static int DH_SFTABLE = 8;
55     final static int DH_SFTABLELEN = 9;
56     final static int DH_SRTABLE = 10;
57     final static int DH_SRTABLELEN = 11;
58     final static int DH_TRIE = 12;
59     final static int DH_TRIELEN = 13;
60     final static int DH_RULESOURCE = 14;
61     final static int DH_RULESOURCELEN = 15;
62     final static int DH_STATUSTABLE = 16;
63     final static int DH_STATUSTABLELEN = 17;
64     
65     
66     // Index offsets to the fields in a state table row.
67
// Corresponds to struct RBBIStateTableRow in the C version.
68
//
69
final static int ACCEPTING = 0;
70     final static int LOOKAHEAD = 1;
71     final static int TAGIDX = 2;
72     final static int RESERVED = 3;
73     final static int NEXTSTATES = 4;
74     
75     // Index offsets to header fields of a state table
76
// struct RBBIStateTable {... in the C version.
77
//
78
final static int NUMSTATES = 0;
79     final static int ROWLEN = 2;
80     final static int FLAGS = 4;
81     final static int RESERVED_2 = 6;
82     final static int ROW_DATA = 8;
83     
84     // Bit selectors for the "FLAGS" field of the state table header
85
// enum RBBIStateTableFlags in the C version.
86
//
87
final static int RBBI_LOOKAHEAD_HARD_BREAK = 1;
88     final static int RBBI_BOF_REQUIRED = 2;
89     
90     // Getters for fields from the state table header
91
//
92
final static int getNumStates(short table[]) {
93         int hi = table[NUMSTATES];
94         int lo = table[NUMSTATES+1];
95         int val = (hi<<16) + (lo&0x0000ffff);
96         return val;
97      }
98     
99     
100     /**
101      * Data Header. A struct-like class with the fields from the RBBI data file header.
102      */

103     final static class RBBIDataHeader {
104         int fMagic; // == 0xbla0
105
int fVersion; // == 1 (for ICU 3.2 and earlier.
106
byte[] fFormatVersion; // For ICU 3.4 and later.
107
int fLength; // Total length in bytes of this RBBI Data,
108
// including all sections, not just the header.
109
int fCatCount; // Number of character categories.
110

111         //
112
// Offsets and sizes of each of the subsections within the RBBI data.
113
// All offsets are bytes from the start of the RBBIDataHeader.
114
// All sizes are in bytes.
115
//
116
int fFTable; // forward state transition table.
117
int fFTableLen;
118         int fRTable; // Offset to the reverse state transition table.
119
int fRTableLen;
120         int fSFTable; // safe point forward transition table
121
int fSFTableLen;
122         int fSRTable; // safe point reverse transition table
123
int fSRTableLen;
124         int fTrie; // Offset to Trie data for character categories
125
int fTrieLen;
126         int fRuleSource; // Offset to the source for for the break
127
int fRuleSourceLen; // rules. Stored UChar *.
128
int fStatusTable; // Offset to the table of rule status values
129
int fStatusTableLen;
130
131         public RBBIDataHeader() {
132             fMagic = 0;
133             fFormatVersion = new byte[4];
134         }
135     }
136     
137     
138     /**
139      * RBBI State Table Indexing Function. Given a state number, return the
140      * array index of the start of the state table row for that state.
141      *
142      */

143     int getRowIndex(int state){
144         return ROW_DATA + state * (fHeader.fCatCount + 4);
145     }
146     
147     static class TrieFoldingFunc implements Trie.DataManipulate {
148         public int getFoldingOffset(int data) {
149             if ((data & 0x8000) != 0) {
150                 return data & 0x7fff;
151             } else {
152                 return 0;
153             }
154         }
155     }
156     static TrieFoldingFunc fTrieFoldingFunc = new TrieFoldingFunc();
157  
158     
159     RBBIDataWrapper() {
160     }
161
162     static RBBIDataWrapper get(String JavaDoc name) throws IOException JavaDoc {
163         String JavaDoc fullName = "data/" + name;
164         InputStream JavaDoc is = ICUData.getRequiredStream(fullName);
165         return get(is);
166     }
167     
168     /*
169      * Get an RBBIDataWrapper from an InputStream onto a pre-compiled set
170      * of RBBI rules.
171      */

172     static RBBIDataWrapper get(InputStream JavaDoc is) throws IOException JavaDoc {
173         int i;
174         
175         DataInputStream JavaDoc dis = new DataInputStream JavaDoc(new BufferedInputStream JavaDoc(is));
176         RBBIDataWrapper This = new RBBIDataWrapper();
177         
178         // Seek past the ICU data header.
179
// TODO: verify that the header looks good.
180
dis.skip(0x80);
181         
182         // Read in the RBBI data header...
183
This.fHeader = new RBBIDataHeader();
184         This.fHeader.fMagic = dis.readInt();
185         This.fHeader.fVersion = dis.readInt();
186         This.fHeader.fFormatVersion[0] = (byte) (This.fHeader.fVersion >> 24);
187         This.fHeader.fFormatVersion[1] = (byte) (This.fHeader.fVersion >> 16);
188         This.fHeader.fFormatVersion[2] = (byte) (This.fHeader.fVersion >> 8);
189         This.fHeader.fFormatVersion[3] = (byte) (This.fHeader.fVersion);
190         This.fHeader.fLength = dis.readInt();
191         This.fHeader.fCatCount = dis.readInt();
192         This.fHeader.fFTable = dis.readInt();
193         This.fHeader.fFTableLen = dis.readInt();
194         This.fHeader.fRTable = dis.readInt();
195         This.fHeader.fRTableLen = dis.readInt();
196         This.fHeader.fSFTable = dis.readInt();
197         This.fHeader.fSFTableLen = dis.readInt();
198         This.fHeader.fSRTable = dis.readInt();
199         This.fHeader.fSRTableLen = dis.readInt();
200         This.fHeader.fTrie = dis.readInt();
201         This.fHeader.fTrieLen = dis.readInt();
202         This.fHeader.fRuleSource = dis.readInt();
203         This.fHeader.fRuleSourceLen = dis.readInt();
204         This.fHeader.fStatusTable = dis.readInt();
205         This.fHeader.fStatusTableLen = dis.readInt();
206         dis.skip(6 * 4); // uint32_t fReserved[6];
207

208         
209         if (This.fHeader.fMagic != 0xb1a0 ||
210                 ! (This.fHeader.fVersion == 1 || // ICU 3.2 and earlier
211
This.fHeader.fFormatVersion[0] == 3) // ICU 3.4
212
) {
213             throw new IOException JavaDoc("Break Iterator Rule Data Magic Number Incorrect, or unsupported data version.");
214         }
215         
216         // Current position in input stream.
217
int pos = 24 * 4; // offset of end of header, which has 24 fields, all int32_t (4 bytes)
218

219         //
220
// Read in the Forward state transition table as an array of shorts.
221
//
222

223         // Quick Sanity Check
224
if (This.fHeader.fFTable < pos || This.fHeader.fFTable > This.fHeader.fLength) {
225              throw new IOException JavaDoc("Break iterator Rule data corrupt");
226         }
227         
228         // Skip over any padding preceding this table
229
dis.skip(This.fHeader.fFTable - pos);
230         pos = This.fHeader.fFTable;
231         
232         This.fFTable = new short[This.fHeader.fFTableLen / 2];
233         for ( i=0; i<This.fFTable.length; i++) {
234             This.fFTable[i] = dis.readShort();
235             pos += 2;
236         }
237         
238         //
239
// Read in the Reverse state table
240
//
241

242         // Skip over any padding in the file
243
dis.skip(This.fHeader.fRTable - pos);
244         pos = This.fHeader.fRTable;
245         
246         // Create & fill the table itself.
247
This.fRTable = new short[This.fHeader.fRTableLen / 2];
248         for (i=0; i<This.fRTable.length; i++) {
249             This.fRTable[i] = dis.readShort();
250             pos += 2;
251         }
252         
253         //
254
// Read in the Safe Forward state table
255
//
256
if (This.fHeader.fSFTableLen > 0) {
257             // Skip over any padding in the file
258
dis.skip(This.fHeader.fSFTable - pos);
259             pos = This.fHeader.fSFTable;
260             
261             // Create & fill the table itself.
262
This.fSFTable = new short[This.fHeader.fSFTableLen / 2];
263             for (i=0; i<This.fSFTable.length; i++) {
264                 This.fSFTable[i] = dis.readShort();
265                 pos += 2;
266             }
267         }
268         
269         //
270
// Read in the Safe Reverse state table
271
//
272
if (This.fHeader.fSRTableLen > 0) {
273             // Skip over any padding in the file
274
dis.skip(This.fHeader.fSRTable - pos);
275             pos = This.fHeader.fSRTable;
276             
277             // Create & fill the table itself.
278
This.fSRTable = new short[This.fHeader.fSRTableLen / 2];
279             for (i=0; i<This.fSRTable.length; i++) {
280                 This.fSRTable[i] = dis.readShort();
281                 pos += 2;
282             }
283         }
284         
285         //
286
// Unserialize the Character categories TRIE
287
// Because we can't be absolutely certain where the Trie deserialize will
288
// leave the input stream, leave position unchanged.
289
// The seek to the start of the next item following the TRIE will get us
290
// back in sync.
291
//
292
dis.skip(This.fHeader.fTrie - pos); // seek input stream from end of previous section to
293
pos = This.fHeader.fTrie; // to the start of the trie
294

295         dis.mark(This.fHeader.fTrieLen+100); // Mark position of start of TRIE in the input
296
// and tell Java to keep the mark valid so long
297
// as we don't go more than 100 bytes past the
298
// past the end of the TRIE.
299

300         This.fTrie = new CharTrie(dis, fTrieFoldingFunc); // Deserialize the TRIE, leaving input
301
// stream at an unknown position, preceding the
302
// padding between TRIE and following section.
303

304         dis.reset(); // Move input stream back to marked position at
305
// the start of the serialized TRIE. Now our
306
// "pos" variable and the input stream are in
307
// agreement.
308

309         //
310
// Read the Rule Status Table
311
//
312
if (pos > This.fHeader.fStatusTable) {
313             throw new IOException JavaDoc("Break iterator Rule data corrupt");
314         }
315         dis.skip(This.fHeader.fStatusTable - pos);
316         pos = This.fHeader.fStatusTable;
317         This.fStatusTable = new int[This.fHeader.fStatusTableLen / 4];
318         for (i=0; i<This.fStatusTable.length; i++) {
319             This.fStatusTable[i] = dis.readInt();
320             pos += 4;
321         }
322         
323         //
324
// Put the break rule source into a String
325
//
326
if (pos > This.fHeader.fRuleSource) {
327             throw new IOException JavaDoc("Break iterator Rule data corrupt");
328         }
329         dis.skip(This.fHeader.fRuleSource - pos);
330         pos = This.fHeader.fRuleSource;
331         StringBuffer JavaDoc sb = new StringBuffer JavaDoc(This.fHeader.fRuleSourceLen / 2);
332         for (i=0; i<This.fHeader.fRuleSourceLen; i+=2) {
333             sb.append(dis.readChar());
334             pos += 2;
335         }
336         This.fRuleSource = sb.toString();
337         
338         if (RuleBasedBreakIterator.fDebugEnv!=null && RuleBasedBreakIterator.fDebugEnv.indexOf("data")>=0) {
339             This.dump();
340         }
341         return This;
342     }
343     
344     
345     
346     /** Debug function to display the break iterator data.
347      * @internal
348      */

349     void dump() {
350         System.out.println("RBBI Data Wrapper dump ...");
351         System.out.println();
352         System.out.println("Forward State Table");
353         dumpTable(fFTable);
354         System.out.println("Reverse State Table");
355         dumpTable(fRTable);
356         System.out.println("Forward Safe Points Table");
357         dumpTable(fSFTable);
358         System.out.println("Reverse Safe Points Table");
359         dumpTable(fSRTable);
360         
361         dumpCharCategories();
362         System.out.println("Source Rules: " + fRuleSource);
363         
364     }
365     
366     /** Fixed width int-to-string conversion.
367      * @internal
368      *
369      */

370     static public String JavaDoc intToString(int n, int width) {
371         StringBuffer JavaDoc dest = new StringBuffer JavaDoc(width);
372         dest.append(n);
373         while (dest.length() < width) {
374            dest.insert(0, ' ');
375         }
376         return dest.toString();
377     }
378     
379     /** Fixed width int-to-string conversion.
380      * @internal
381      *
382      */

383     static public String JavaDoc intToHexString(int n, int width) {
384         StringBuffer JavaDoc dest = new StringBuffer JavaDoc(width);
385         dest.append(Integer.toHexString(n));
386         while (dest.length() < width) {
387            dest.insert(0, ' ');
388         }
389         return dest.toString();
390     }
391     
392     /** Dump a state table. (A full set of RBBI rules has 4 state tables.) */
393     private void dumpTable(short table[]) {
394         if (table == null) {
395             System.out.println(" -- null -- ");
396         } else {
397             int n;
398             int state;
399             String JavaDoc header = " Row Acc Look Tag";
400             for (n=0; n<fHeader.fCatCount; n++) {
401                 header += intToString(n, 5);
402             }
403             System.out.println(header);
404             for (n=0; n<header.length(); n++) {
405                 System.out.print("-");
406             }
407             System.out.println();
408             for (state=0; state< getNumStates(table); state++) {
409                 dumpRow(table, state);
410             }
411             System.out.println();
412         }
413     }
414     
415     /**
416      * Dump (for debug) a single row of an RBBI state table
417      * @param table
418      * @param state
419      * @internal
420      */

421     private void dumpRow(short table[], int state) {
422         StringBuffer JavaDoc dest = new StringBuffer JavaDoc(fHeader.fCatCount*5 + 20);
423         dest.append(intToString(state, 4));
424         int row = getRowIndex(state);
425         if (table[row+ACCEPTING] != 0) {
426            dest.append(intToString(table[row+ACCEPTING], 5));
427         }else {
428             dest.append(" ");
429         }
430         if (table[row+LOOKAHEAD] != 0) {
431             dest.append(intToString(table[row+LOOKAHEAD], 5));
432         }else {
433             dest.append(" ");
434         }
435         dest.append(intToString(table[row+TAGIDX], 5));
436         
437         for (int col=0; col<fHeader.fCatCount; col++) {
438             dest.append(intToString(table[row+NEXTSTATES+col], 5));
439         }
440
441         System.out.println(dest);
442     }
443     
444     private void dumpCharCategories() {
445         int n = fHeader.fCatCount;
446         String JavaDoc catStrings[] = new String JavaDoc[n+1];
447         int rangeStart = 0;
448         int rangeEnd = 0;
449         int lastCat = -1;
450         int char32;
451         int category;
452         int lastNewline[] = new int[n+1];
453         
454         for (category = 0; category <= fHeader.fCatCount; category ++) {
455             catStrings[category] = "";
456         }
457         System.out.println("\nCharacter Categories");
458         System.out.println("--------------------");
459         for (char32 = 0; char32<=0x10ffff; char32++) {
460             category = fTrie.getCodePointValue(char32);
461             category &= ~0x4000; // Mask off dictionary bit.
462
if (category < 0 || category > fHeader.fCatCount) {
463                 System.out.println("Error, bad category " + Integer.toHexString(category) +
464                         " for char " + Integer.toHexString(char32));
465                 break;
466             }
467             if (category == lastCat ) {
468                 rangeEnd = char32;
469             } else {
470                 if (lastCat >= 0) {
471                     if (catStrings[lastCat].length() > lastNewline[lastCat] + 70) {
472                         lastNewline[lastCat] = catStrings[lastCat].length() + 10;
473                         catStrings[lastCat] += "\n ";
474                     }
475                     
476                     catStrings[lastCat] += " " + Integer.toHexString(rangeStart);
477                     if (rangeEnd != rangeStart) {
478                         catStrings[lastCat] += "-" + Integer.toHexString(rangeEnd);
479                     }
480                 }
481                 lastCat = category;
482                 rangeStart = rangeEnd = char32;
483             }
484         }
485         catStrings[lastCat] += " " + Integer.toHexString(rangeStart);
486         if (rangeEnd != rangeStart) {
487             catStrings[lastCat] += "-" + Integer.toHexString(rangeEnd);
488         }
489         
490         for (category = 0; category <= fHeader.fCatCount; category ++) {
491             System.out.println (intToString(category, 5) + " " + catStrings[category]);
492         }
493         System.out.println();
494     }
495     
496     public static void main(String JavaDoc[] args) {
497         String JavaDoc s;
498         if (args.length == 0) {
499             s = "char";
500         } else {
501             s = args[0];
502         }
503         System.out.println("RBBIDataWrapper.main(" + s + ") ");
504         
505         String JavaDoc versionedName = ICUResourceBundle.ICU_BUNDLE+"/"+ s + ".brk";
506         
507         try {
508             RBBIDataWrapper This = RBBIDataWrapper.get(versionedName);
509             This.dump();
510         }
511        catch (Exception JavaDoc e) {
512            System.out.println("Exception: " + e.toString());
513        }
514            
515     }
516
517 }
518
Popular Tags