KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > com > ibm > icu > text > RBBIRuleBuilder


//
// Copyright (C) 2002-2006, International Business Machines Corporation and others.
// All Rights Reserved.
//
//

package com.ibm.icu.text;

import com.ibm.icu.impl.Assert;
import java.io.DataOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
16
17 class RBBIRuleBuilder {
18     // This is the main class for building (compiling) break rules into the tables
19
// required by the runtime RBBI engine.
20
//
21

22     String JavaDoc fDebugEnv; // controls debug trace output
23
String JavaDoc fRules; // The rule string that we are compiling
24
RBBIRuleScanner fScanner; // The scanner.
25

26     
27     //
28
// There are four separate parse trees generated, one for each of the
29
// forward rules, reverse rules, safe forward rules and safe reverse rules.
30
// This array references the root of each of the trees.
31
//
32
RBBINode[] fTreeRoots = new RBBINode[4];
33     static final int fForwardTree = 0; // Indexes into the above fTreeRoots array
34
static final int fReverseTree = 1; // for each of the trees.
35
static final int fSafeFwdTree = 2; // (in C, these are pointer variables and
36
static final int fSafeRevTree = 3; // there is no array.)
37
int fDefaultTree = fForwardTree; // For rules not qualified with a !
38
// the tree to which they belong to.
39

40     boolean fChainRules; // True for chained Unicode TR style rules.
41
// False for traditional regexp rules.
42

43     boolean fLBCMNoChain; // True: suppress chaining of rules on
44
// chars with LineBreak property == CM.
45

46     boolean fLookAheadHardBreak; // True: Look ahead matches cause an
47
// immediate break, no continuing for the
48
// longest match.
49

50     RBBISetBuilder fSetBuilder; // Set and Character Category builder.
51
List JavaDoc fUSetNodes; // Vector of all uset nodes.
52
RBBITableBuilder fForwardTables; // State transition tables
53
RBBITableBuilder fReverseTables;
54     RBBITableBuilder fSafeFwdTables;
55     RBBITableBuilder fSafeRevTables;
56
57     //
58
// Status {tag} values. These structures are common to all of the rule sets (Forward, Reverse, etc.).
59
//
60
Map JavaDoc fStatusSets = new HashMap JavaDoc(); // Status value sets encountered so far.
61
// Map Key is the set of values.
62
// Map Value is the runtime array index.
63

64     List JavaDoc fRuleStatusVals; // List of Integer objects. Has same layout as the
65
// runtime array of status (tag) values -
66
// number of values in group 1
67
// first status value in group 1
68
// 2nd status value in group 1
69
// ...
70
// number of values in group 2
71
// first status value in group 2
72
// etc.
73
//
74
// Error codes from ICU4C.
75
// using these simplified the porting, and consolidated the
76
// creation of Java exceptions
77
//
78
static final int U_BRK_ERROR_START = 0x10200;
79     /**< Start of codes indicating Break Iterator failures */
80     
81     static final int U_BRK_INTERNAL_ERROR = 0x10201;
82     /**< An internal error (bug) was detected. */
83     
84     static final int U_BRK_HEX_DIGITS_EXPECTED = 0x10202;
85     /**< Hex digits expected as part of a escaped char in a rule. */
86     
87     static final int U_BRK_SEMICOLON_EXPECTED = 0x10203;
88     /**< Missing ';' at the end of a RBBI rule. */
89     
90     static final int U_BRK_RULE_SYNTAX = 0x10204;
91     /**< Syntax error in RBBI rule. */
92     
93     static final int U_BRK_UNCLOSED_SET = 0x10205;
94     /**< UnicodeSet witing an RBBI rule missing a closing ']'. */
95     
96     static final int U_BRK_ASSIGN_ERROR = 0x10206;
97     /**< Syntax error in RBBI rule assignment statement. */
98     
99     static final int U_BRK_VARIABLE_REDFINITION = 0x10207;
100     /**< RBBI rule $Variable redefined. */
101     
102     static final int U_BRK_MISMATCHED_PAREN = 0x10208;
103     /**< Mis-matched parentheses in an RBBI rule. */
104     
105     static final int U_BRK_NEW_LINE_IN_QUOTED_STRING = 0x10209;
106     /**< Missing closing quote in an RBBI rule. */
107     
108     static final int U_BRK_UNDEFINED_VARIABLE = 0x1020a;
109     /**< Use of an undefined $Variable in an RBBI rule. */
110     
111     static final int U_BRK_INIT_ERROR = 0x1020b;
112     /**< Initialization failure. Probable missing ICU Data. */
113     
114     static final int U_BRK_RULE_EMPTY_SET = 0x1020c;
115     /**< Rule contains an empty Unicode Set. */
116     
117     static final int U_BRK_UNRECOGNIZED_OPTION = 0x1020d;
118     /**< !!option in RBBI rules not recognized. */
119     
120     static final int U_BRK_MALFORMED_RULE_TAG = 0x1020e;
121     /**< The {nnn} tag on a rule is mal formed */
122     static final int U_BRK_MALFORMED_SET = 0x1020f;
123     
124     static final int U_BRK_ERROR_LIMIT = 0x10210;
125     /**< This must always be the last value to indicate the limit for Break Iterator failures */
126
127
128     //----------------------------------------------------------------------------------------
129
//
130
// Constructor.
131
//
132
//----------------------------------------------------------------------------------------
133
RBBIRuleBuilder(String JavaDoc rules)
134     {
135         fDebugEnv = System.getProperty("U_RBBIDEBUG");
136         fRules = rules;
137         fUSetNodes = new ArrayList JavaDoc();
138         fRuleStatusVals = new ArrayList JavaDoc();
139         fScanner = new RBBIRuleScanner(this);
140         fSetBuilder = new RBBISetBuilder(this);
141     }
142
143     //----------------------------------------------------------------------------------------
144
//
145
// flattenData() - Collect up the compiled RBBI rule data and put it into
146
// the format for saving in ICU data files,
147
//
148
// See the ICU4C file common/rbidata.h for a detailed description.
149
//
150
//----------------------------------------------------------------------------------------
151
static final int align8(int i)
152     {
153         return (i + 7) & 0xfffffff8;
154     }
155
156     void flattenData(OutputStream JavaDoc os) throws IOException JavaDoc {
157         DataOutputStream JavaDoc dos = new DataOutputStream JavaDoc(os);
158         int i;
159     
160         // Remove comments and whitespace from the rules to make it smaller.
161
String JavaDoc strippedRules = RBBIRuleScanner.stripRules(fRules);
162     
163         // Calculate the size of each section in the data in bytes.
164
// Sizes here are padded up to a multiple of 8 for better memory alignment.
165
// Sections sizes actually stored in the header are for the actual data
166
// without the padding.
167
//
168
int headerSize = 24 * 4; // align8(sizeof(RBBIDataHeader));
169
int forwardTableSize = align8(fForwardTables.getTableSize());
170         int reverseTableSize = align8(fReverseTables.getTableSize());
171         int safeFwdTableSize = align8(fSafeFwdTables.getTableSize());
172         int safeRevTableSize = align8(fSafeRevTables.getTableSize());
173         int trieSize = align8(fSetBuilder.getTrieSize());
174         int statusTableSize = align8(fRuleStatusVals.size() * 4);
175         int rulesSize = align8((strippedRules.length()) * 2);
176         int totalSize = headerSize + forwardTableSize + reverseTableSize
177                                 + safeFwdTableSize + safeRevTableSize
178                                 + statusTableSize + trieSize + rulesSize;
179         int outputPos = 0; // Track stream position, starting from RBBIDataHeader.
180

181         //
182
// Write out an ICU Data Header
183
// TODO: actually create a real header, rather than just a placeholder.
184
// The empty placeholder is ok for compile-and-go from within ICU4J.
185
// Replicating the ICU4C genbrk tool for building .brk resources would need a real header.
186
//
187
byte[] ICUDataHeader = new byte[0x80];
188         dos.write(ICUDataHeader);
189
190         //
191
// Write out the RBBIDataHeader
192
//
193
int[] header = new int[RBBIDataWrapper.DH_SIZE]; // sizeof struct RBBIDataHeader
194
header[RBBIDataWrapper.DH_MAGIC] = 0xb1a0;
195         header[RBBIDataWrapper.DH_FORMATVERSION] = 0x03010000; // uint8_t fFormatVersion[4];
196
header[RBBIDataWrapper.DH_LENGTH] = totalSize; // fLength, the total size of all rule sections.
197
header[RBBIDataWrapper.DH_CATCOUNT] = fSetBuilder.getNumCharCategories(); // fCatCount.
198
header[RBBIDataWrapper.DH_FTABLE] = headerSize; // fFTable
199
header[RBBIDataWrapper.DH_FTABLELEN] = forwardTableSize; // fTableLen
200
header[RBBIDataWrapper.DH_RTABLE] = header[RBBIDataWrapper.DH_FTABLE] + forwardTableSize; // fRTable
201
header[RBBIDataWrapper.DH_RTABLELEN] = reverseTableSize; // fRTableLen
202
header[RBBIDataWrapper.DH_SFTABLE] = header[RBBIDataWrapper.DH_RTABLE]
203                                                      + reverseTableSize; // fSTable
204
header[RBBIDataWrapper.DH_SFTABLELEN] = safeFwdTableSize; // fSTableLen
205
header[RBBIDataWrapper.DH_SRTABLE] = header[RBBIDataWrapper.DH_SFTABLE]
206                                                      + safeFwdTableSize; // fSRTable
207
header[RBBIDataWrapper.DH_SRTABLELEN] = safeRevTableSize; // fSRTableLen
208
header[RBBIDataWrapper.DH_TRIE] = header[RBBIDataWrapper.DH_SRTABLE]
209                                                      + safeRevTableSize; // fTrie
210
header[RBBIDataWrapper.DH_TRIELEN] = fSetBuilder.getTrieSize(); // fTrieLen
211
header[RBBIDataWrapper.DH_STATUSTABLE] = header[RBBIDataWrapper.DH_TRIE]
212                                                      + header[RBBIDataWrapper.DH_TRIELEN];
213         header[RBBIDataWrapper.DH_STATUSTABLELEN] = statusTableSize; // fStatusTableLen
214
header[RBBIDataWrapper.DH_RULESOURCE] = header[RBBIDataWrapper.DH_STATUSTABLE]
215                                                      + statusTableSize;
216         header[RBBIDataWrapper.DH_RULESOURCELEN] = strippedRules.length() * 2;
217         for (i = 0; i < header.length; i++) {
218             dos.writeInt(header[i]);
219             outputPos += 4;
220         }
221
222         // Write out the actual state tables.
223
short[] tableData;
224         tableData = fForwardTables.exportTable();
225         Assert.assrt(outputPos == header[4]);
226         for (i = 0; i < tableData.length; i++) {
227             dos.writeShort(tableData[i]);
228             outputPos += 2;
229         }
230
231         tableData = fReverseTables.exportTable();
232         Assert.assrt(outputPos == header[6]);
233         for (i = 0; i < tableData.length; i++) {
234             dos.writeShort(tableData[i]);
235             outputPos += 2;
236         }
237
238         Assert.assrt(outputPos == header[8]);
239         tableData = fSafeFwdTables.exportTable();
240         for (i = 0; i < tableData.length; i++) {
241             dos.writeShort(tableData[i]);
242             outputPos += 2;
243         }
244
245         Assert.assrt(outputPos == header[10]);
246         tableData = fSafeRevTables.exportTable();
247         for (i = 0; i < tableData.length; i++) {
248             dos.writeShort(tableData[i]);
249             outputPos += 2;
250         }
251
252         // write out the Trie table
253
Assert.assrt(outputPos == header[12]);
254         fSetBuilder.serializeTrie(os);
255         outputPos += header[13];
256         while (outputPos % 8 != 0) { // pad to an 8 byte boundary
257
dos.write(0);
258             outputPos += 1;
259         }
260
261         // Write out the status {tag} table.
262
Assert.assrt(outputPos == header[16]);
263         for (i = 0; i < fRuleStatusVals.size(); i++) {
264             Integer JavaDoc val = (Integer JavaDoc) fRuleStatusVals.get(i);
265             dos.writeInt(val.intValue());
266             outputPos += 4;
267         }
268
269         while (outputPos % 8 != 0) { // pad to an 8 byte boundary
270
dos.write(0);
271             outputPos += 1;
272         }
273
274         // Write out the stripped rules (rules with extra spaces removed
275
// These go last in the data area, even though they are not last in the header.
276
Assert.assrt(outputPos == header[14]);
277         dos.writeChars(strippedRules);
278         outputPos += strippedRules.length() * 2;
279         while (outputPos % 8 != 0) { // pad to an 8 byte boundary
280
dos.write(0);
281             outputPos += 1;
282         }
283     }
284
285     //----------------------------------------------------------------------------------------
286
//
287
// compileRules compile source rules, placing the compiled form into a output stream
288
// The compiled form is identical to that from ICU4C (Big Endian).
289
//
290
//----------------------------------------------------------------------------------------
291
static void compileRules(String JavaDoc rules, OutputStream JavaDoc os) throws IOException JavaDoc
292     {
293         //
294
// Read the input rules, generate a parse tree, symbol table,
295
// and list of all Unicode Sets referenced by the rules.
296
//
297
RBBIRuleBuilder builder = new RBBIRuleBuilder(rules);
298         builder.fScanner.parse();
299
300         //
301
// UnicodeSet processing.
302
// Munge the Unicode Sets to create a set of character categories.
303
// Generate the mapping tables (TRIE) from input 32-bit characters to
304
// the character categories.
305
//
306
builder.fSetBuilder.build();
307
308         //
309
// Generate the DFA state transition table.
310
//
311
builder.fForwardTables = new RBBITableBuilder(builder, fForwardTree);
312         builder.fReverseTables = new RBBITableBuilder(builder, fReverseTree);
313         builder.fSafeFwdTables = new RBBITableBuilder(builder, fSafeFwdTree);
314         builder.fSafeRevTables = new RBBITableBuilder(builder, fSafeRevTree);
315         builder.fForwardTables.build();
316         builder.fReverseTables.build();
317         builder.fSafeFwdTables.build();
318         builder.fSafeRevTables.build();
319         if (builder.fDebugEnv != null
320                 && builder.fDebugEnv.indexOf("states") >= 0) {
321             builder.fForwardTables.printRuleStatusTable();
322         }
323
324         //
325
// Package up the compiled data, writing it to an output stream
326
// in the serialization format. This is the same as the ICU4C runtime format.
327
//
328
builder.flattenData(os);
329     }
330 }
331
Popular Tags