KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > com > ibm > icu > text > BreakDictionary


1 /*
2  *******************************************************************************
3  * Copyright (C) 1996-2006, International Business Machines Corporation and *
4  * others. All Rights Reserved. *
5  *******************************************************************************
6  */

7 package com.ibm.icu.text;
8
9 import java.io.InputStream JavaDoc;
10 import java.io.DataInputStream JavaDoc;
11 import java.io.FileNotFoundException JavaDoc;
12 import java.io.UnsupportedEncodingException JavaDoc;
13 import java.io.IOException JavaDoc;
14 import java.io.FileInputStream JavaDoc;
15 import java.io.OutputStreamWriter JavaDoc;
16 import java.io.PrintWriter JavaDoc;
17 import java.io.FileOutputStream JavaDoc;
18
19 import com.ibm.icu.util.CompactByteArray;
20
21 /**
22  * This is the class that represents the list of known words used by
23  * DictionaryBasedBreakIterator. The conceptual data structure used
24  * here is a trie: there is a node hanging off the root node for every
25  * letter that can start a word. Each of these nodes has a node hanging
26  * off of it for every letter that can be the second letter of a word
27  * if this node is the first letter, and so on. The trie is represented
28  * as a two-dimensional array that can be treated as a table of state
29  * transitions. Indexes are used to compress this array, taking
30  * advantage of the fact that this array will always be very sparse.
31  * @internal
32  * @deprecated This API is ICU internal only.
33  */

34 public class BreakDictionary {
35     //=================================================================================
36
// testing and debugging
37
//=================================================================================
38
/**
39      * @internal
40      * @deprecated This API is ICU internal only.
41      */

42     public static void main(String JavaDoc args[])
43             throws FileNotFoundException JavaDoc, UnsupportedEncodingException JavaDoc, IOException JavaDoc {
44         String JavaDoc filename = args[0];
45
46         BreakDictionary dictionary = new BreakDictionary(new FileInputStream JavaDoc(filename));
47
48         PrintWriter JavaDoc out = null;
49
50         if(args.length >= 2) {
51             out = new PrintWriter JavaDoc(new OutputStreamWriter JavaDoc(new FileOutputStream JavaDoc(args[1]), "UnicodeLittle"));
52         }
53
54         dictionary.printWordList("", 0, out);
55
56         if (out != null) {
57             out.close();
58         }
59     }
60
61     /**
62      * @internal
63      * @deprecated This API is ICU internal only.
64      */

65     public void printWordList(String JavaDoc partialWord, int state, PrintWriter JavaDoc out)
66             throws IOException JavaDoc {
67         if (state == 0xFFFF) {
68             System.out.println(partialWord);
69             if (out != null) {
70                 out.println(partialWord);
71             }
72         }
73         else {
74             for (int i = 0; i < numCols; i++) {
75                 int newState = (at(state, i)) & 0xFFFF;
76
77                 if (newState != 0) {
78                     char newChar = reverseColumnMap[i];
79                     String JavaDoc newPartialWord = partialWord;
80
81                     if (newChar != 0) {
82                         newPartialWord += newChar;
83                     }
84
85                     printWordList(newPartialWord, newState, out);
86                 }
87             }
88         }
89     }
90
91     /**
92      * A map used to go from column numbers to characters. Used only
93      * for debugging right now.
94      */

95     private char[] reverseColumnMap = null;
96
97     //=================================================================================
98
// data members
99
//=================================================================================
100

101     /**
102      * Maps from characters to column numbers. The main use of this is to
103      * avoid making room in the array for empty columns.
104      */

105     private CompactByteArray columnMap = null;
106
107     /**
108      * The number of actual columns in the table
109      */

110     private int numCols;
111
112     /**
113      * Columns are organized into groups of 32. This says how many
114      * column groups. (We could calculate this, but we store the
115      * value to avoid having to repeatedly calculate it.)
116      */

117     private int numColGroups;
118
119     /**
120      * The actual compressed state table. Each conceptual row represents
121      * a state, and the cells in it contain the row numbers of the states
122      * to transition to for each possible letter. 0 is used to indicate
123      * an illegal combination of letters (i.e., the error state). The
124      * table is compressed by eliminating all the unpopulated (i.e., zero)
125      * cells. Multiple conceptual rows can then be doubled up in a single
126      * physical row by sliding them up and possibly shifting them to one
127      * side or the other so the populated cells don't collide. Indexes
128      * are used to identify unpopulated cells and to locate populated cells.
129      */

130     private short[] table = null;
131
132     /**
133      * This index maps logical row numbers to physical row numbers
134      */

135     private short[] rowIndex = null;
136
137     /**
138      * A bitmap is used to tell which cells in the comceptual table are
139      * populated. This array contains all the unique bit combinations
140      * in that bitmap. If the table is more than 32 columns wide,
141      * successive entries in this array are used for a single row.
142      */

143     private int[] rowIndexFlags = null;
144
145     /**
146      * This index maps from a logical row number into the bitmap table above.
147      * (This keeps us from storing duplicate bitmap combinations.) Since there
148      * are a lot of rows with only one populated cell, instead of wasting space
149      * in the bitmap table, we just store a negative number in this index for
150      * rows with one populated cell. The absolute value of that number is
151      * the column number of the populated cell.
152      */

153     private short[] rowIndexFlagsIndex = null;
154
155     /**
156      * For each logical row, this index contains a constant that is added to
157      * the logical column number to get the physical column number
158      */

159     private byte[] rowIndexShifts = null;
160
161     //=================================================================================
162
// deserialization
163
//=================================================================================
164

165     /**
166      * @internal
167      * @deprecated This API is ICU internal only.
168      */

169     public BreakDictionary(InputStream JavaDoc dictionaryStream) throws IOException JavaDoc {
170         readDictionaryFile(new DataInputStream JavaDoc(dictionaryStream));
171     }
172
173     /**
174      * @internal
175      * @deprecated This API is ICU internal only.
176      */

177     public void readDictionaryFile(DataInputStream JavaDoc in) throws IOException JavaDoc {
178         int l;
179
180         // read in the version number (right now we just ignore it)
181
in.readInt();
182
183         // read in the column map (this is serialized in its internal form:
184
// an index array followed by a data array)
185
l = in.readInt();
186         char[] temp = new char[l];
187         for (int i = 0; i < temp.length; i++)
188             temp[i] = (char)in.readShort();
189         l = in.readInt();
190         byte[] temp2 = new byte[l];
191         for (int i = 0; i < temp2.length; i++)
192             temp2[i] = in.readByte();
193         columnMap = new CompactByteArray(temp, temp2);
194
195         // read in numCols and numColGroups
196
numCols = in.readInt();
197         numColGroups = in.readInt();
198
199         // read in the row-number index
200
l = in.readInt();
201         rowIndex = new short[l];
202         for (int i = 0; i < rowIndex.length; i++)
203             rowIndex[i] = in.readShort();
204
205         // load in the populated-cells bitmap: index first, then bitmap list
206
l = in.readInt();
207         rowIndexFlagsIndex = new short[l];
208         for (int i = 0; i < rowIndexFlagsIndex.length; i++)
209             rowIndexFlagsIndex[i] = in.readShort();
210         l = in.readInt();
211         rowIndexFlags = new int[l];
212         for (int i = 0; i < rowIndexFlags.length; i++)
213             rowIndexFlags[i] = in.readInt();
214
215         // load in the row-shift index
216
l = in.readInt();
217         rowIndexShifts = new byte[l];
218         for (int i = 0; i < rowIndexShifts.length; i++)
219             rowIndexShifts[i] = in.readByte();
220
221         // finally, load in the actual state table
222
l = in.readInt();
223         table = new short[l];
224         for (int i = 0; i < table.length; i++)
225             table[i] = in.readShort();
226
227         // this data structure is only necessary for testing and debugging purposes
228
reverseColumnMap = new char[numCols];
229         for (char c = 0; c < 0xffff; c++) {
230             int col = columnMap.elementAt(c);
231             if (col != 0) {
232                reverseColumnMap[col] = c;
233             }
234         }
235
236         // close the stream
237
in.close();
238     }
239
240     //=================================================================================
241
// access to the words
242
//=================================================================================
243

244     /**
245      * Uses the column map to map the character to a column number, then
246      * passes the row and column number to the other version of at()
247      * @param row The current state
248      * @param ch The character whose column we're interested in
249      * @return The new state to transition to
250      * @internal
251      * @deprecated This API is ICU internal only.
252      */

253     public final short at(int row, char ch) {
254         int col = columnMap.elementAt(ch);
255         return at(row, col);
256     }
257
258     /**
259      * Returns the value in the cell with the specified (logical) row and
260      * column numbers. In DictionaryBasedBreakIterator, the row number is
261      * a state number, the column number is an input, and the return value
262      * is the row number of the new state to transition to. (0 is the
263      * "error" state, and -1 is the "end of word" state in a dictionary)
264      * @param row The row number of the current state
265      * @param col The column number of the input character (0 means "not a
266      * dictionary character")
267      * @return The row number of the new state to transition to
268      * @internal
269      * @deprecated This API is ICU internal only.
270      */

271     public final short at(int row, int col) {
272         if (cellIsPopulated(row, col)) {
273             // we map from logical to physical row number by looking up the
274
// mapping in rowIndex; we map from logical column number to
275
// physical column number by looking up a shift value for this
276
// logical row and offsetting the logical column number by
277
// the shift amount. Then we can use internalAt() to actually
278
// get the value out of the table.
279
return internalAt(rowIndex[row], col + rowIndexShifts[row]);
280         }
281         else {
282             return 0;
283         }
284     }
285
286     /**
287      * Given (logical) row and column numbers, returns true if the
288      * cell in that position is populated
289      */

290     private final boolean cellIsPopulated(int row, int col) {
291         // look up the entry in the bitmap index for the specified row.
292
// If it's a negative number, it's the column number of the only
293
// populated cell in the row
294
if (rowIndexFlagsIndex[row] < 0) {
295             return col == -rowIndexFlagsIndex[row];
296         }
297
298         // if it's a positive number, it's the offset of an entry in the bitmap
299
// list. If the table is more than 32 columns wide, the bitmap is stored
300
// successive entries in the bitmap list, so we have to divide the column
301
// number by 32 and offset the number we got out of the index by the result.
302
// Once we have the appropriate piece of the bitmap, test the appropriate
303
// bit and return the result.
304
else {
305             int flags = rowIndexFlags[rowIndexFlagsIndex[row] + (col >> 5)];
306             return (flags & (1 << (col & 0x1f))) != 0;
307         }
308     }
309
310     /**
311      * Implementation of at() when we know the specified cell is populated.
312      * @param row The PHYSICAL row number of the cell
313      * @param col The PHYSICAL column number of the cell
314      * @return The value stored in the cell
315      */

316     private final short internalAt(int row, int col) {
317         // the table is a one-dimensional array, so this just does the math necessary
318
// to treat it as a two-dimensional array (we don't just use a two-dimensional
319
// array because two-dimensional arrays are inefficient in Java)
320
return table[row * numCols + col];
321     }
322 }
323
324
Popular Tags