KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > java > text > BreakDictionary


1 /*
2  * @(#)BreakDictionary.java 1.14 03/12/19
3  *
4  * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
5  * SUN PROPRIETARY/CONFIDENTIAL. Use is subject to license terms.
6  */

7
8 /*
9  * @(#)BreakDictionary.java 1.3 99/04/07
10  *
11  * (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved
12  * (C) Copyright IBM Corp. 1996 - 2002 - All Rights Reserved
13  *
14  * The original version of this source code and documentation
15  * is copyrighted and owned by Taligent, Inc., a wholly-owned
16  * subsidiary of IBM. These materials are provided under terms
17  * of a License Agreement between Taligent and Sun. This technology
18  * is protected by multiple US and International patents.
19  *
20  * This notice and attribution to Taligent may not be removed.
21  * Taligent is a registered trademark of Taligent, Inc.
22  */

23 package java.text;
24
25 import java.io.*;
26 import java.security.AccessController JavaDoc;
27 import java.security.PrivilegedActionException JavaDoc;
28 import java.security.PrivilegedExceptionAction JavaDoc;
29 import java.util.MissingResourceException JavaDoc;
30 import sun.text.CompactByteArray;
31 import sun.text.SupplementaryCharacterData;
32
33 /**
34  * This is the class that represents the list of known words used by
35  * DictionaryBasedBreakIterator. The conceptual data structure used
36  * here is a trie: there is a node hanging off the root node for every
37  * letter that can start a word. Each of these nodes has a node hanging
38  * off of it for every letter that can be the second letter of a word
39  * if this node is the first letter, and so on. The trie is represented
40  * as a two-dimensional array that can be treated as a table of state
41  * transitions. Indexes are used to compress this array, taking
42  * advantage of the fact that this array will always be very sparse.
43  */

44 class BreakDictionary {
45
46     //=========================================================================
47
// data members
48
//=========================================================================
49

50     /**
51       * The version of the dictionary that was read in.
52       */

53     private static int supportedVersion = 1;
54
55     /**
56      * Maps from characters to column numbers. The main use of this is to
57      * avoid making room in the array for empty columns.
58      */

59     private CompactByteArray columnMap = null;
60     private SupplementaryCharacterData supplementaryCharColumnMap = null;
61
62     /**
63      * The number of actual columns in the table
64      */

65     private int numCols;
66
67     /**
68      * Columns are organized into groups of 32. This says how many
69      * column groups. (We could calculate this, but we store the
70      * value to avoid having to repeatedly calculate it.)
71      */

72     private int numColGroups;
73
74     /**
75      * The actual compressed state table. Each conceptual row represents
76      * a state, and the cells in it contain the row numbers of the states
77      * to transition to for each possible letter. 0 is used to indicate
78      * an illegal combination of letters (i.e., the error state). The
79      * table is compressed by eliminating all the unpopulated (i.e., zero)
80      * cells. Multiple conceptual rows can then be doubled up in a single
81      * physical row by sliding them up and possibly shifting them to one
82      * side or the other so the populated cells don't collide. Indexes
83      * are used to identify unpopulated cells and to locate populated cells.
84      */

85     private short[] table = null;
86
87     /**
88      * This index maps logical row numbers to physical row numbers
89      */

90     private short[] rowIndex = null;
91
92     /**
93      * A bitmap is used to tell which cells in the comceptual table are
94      * populated. This array contains all the unique bit combinations
95      * in that bitmap. If the table is more than 32 columns wide,
96      * successive entries in this array are used for a single row.
97      */

98     private int[] rowIndexFlags = null;
99
100     /**
101      * This index maps from a logical row number into the bitmap table above.
102      * (This keeps us from storing duplicate bitmap combinations.) Since there
103      * are a lot of rows with only one populated cell, instead of wasting space
104      * in the bitmap table, we just store a negative number in this index for
105      * rows with one populated cell. The absolute value of that number is
106      * the column number of the populated cell.
107      */

108     private short[] rowIndexFlagsIndex = null;
109
110     /**
111      * For each logical row, this index contains a constant that is added to
112      * the logical column number to get the physical column number
113      */

114     private byte[] rowIndexShifts = null;
115
116     //=========================================================================
117
// deserialization
118
//=========================================================================
119

120     public BreakDictionary(String JavaDoc dictionaryName)
121         throws IOException, MissingResourceException JavaDoc {
122
123         readDictionaryFile(dictionaryName);
124     }
125
126     private void readDictionaryFile(final String JavaDoc dictionaryName)
127         throws IOException, MissingResourceException JavaDoc {
128
129         BufferedInputStream in;
130         try {
131             in = (BufferedInputStream)AccessController.doPrivileged(
132                 new PrivilegedExceptionAction JavaDoc() {
133                     public Object JavaDoc run() throws Exception JavaDoc {
134                         return new BufferedInputStream(getClass().getResourceAsStream("/sun/text/resources/" + dictionaryName));
135                     }
136                 }
137             );
138         }
139         catch (PrivilegedActionException JavaDoc e) {
140             throw new InternalError JavaDoc(e.toString());
141         }
142  
143         byte[] buf = new byte[8];
144         if (in.read(buf) != 8) {
145             throw new MissingResourceException JavaDoc("Wrong data length",
146                                                dictionaryName, "");
147         }
148
149         // check vesion
150
int version = BreakIterator.getInt(buf, 0);
151         if (version != supportedVersion) {
152             throw new MissingResourceException JavaDoc("Dictionary version(" + version + ") is unsupported",
153                                                            dictionaryName, "");
154         }
155
156         // get data size
157
int len = BreakIterator.getInt(buf, 4);
158         buf = new byte[len];
159         if (in.read(buf) != len) {
160             throw new MissingResourceException JavaDoc("Wrong data length",
161                                                dictionaryName, "");
162         }
163
164         // close the stream
165
in.close();
166
167         int l;
168         int offset = 0;
169
170         // read in the column map for BMP characteres (this is serialized in
171
// its internal form: an index array followed by a data array)
172
l = BreakIterator.getInt(buf, offset);
173         offset += 4;
174         short[] temp = new short[l];
175         for (int i = 0; i < l; i++, offset+=2) {
176             temp[i] = BreakIterator.getShort(buf, offset);
177         }
178         l = BreakIterator.getInt(buf, offset);
179         offset += 4;
180         byte[] temp2 = new byte[l];
181         for (int i = 0; i < l; i++, offset++) {
182             temp2[i] = buf[offset];
183         }
184         columnMap = new CompactByteArray(temp, temp2);
185
186         // read in numCols and numColGroups
187
numCols = BreakIterator.getInt(buf, offset);
188         offset += 4;
189         numColGroups = BreakIterator.getInt(buf, offset);
190         offset += 4;
191
192         // read in the row-number index
193
l = BreakIterator.getInt(buf, offset);
194         offset += 4;
195         rowIndex = new short[l];
196         for (int i = 0; i < l; i++, offset+=2) {
197             rowIndex[i] = BreakIterator.getShort(buf, offset);
198         }
199
200         // load in the populated-cells bitmap: index first, then bitmap list
201
l = BreakIterator.getInt(buf, offset);
202         offset += 4;
203         rowIndexFlagsIndex = new short[l];
204         for (int i = 0; i < l; i++, offset+=2) {
205             rowIndexFlagsIndex[i] = BreakIterator.getShort(buf, offset);
206         }
207         l = BreakIterator.getInt(buf, offset);
208         offset += 4;
209         rowIndexFlags = new int[l];
210         for (int i = 0; i < l; i++, offset+=4) {
211             rowIndexFlags[i] = BreakIterator.getInt(buf, offset);
212         }
213
214         // load in the row-shift index
215
l = BreakIterator.getInt(buf, offset);
216         offset += 4;
217         rowIndexShifts = new byte[l];
218         for (int i = 0; i < l; i++, offset++) {
219             rowIndexShifts[i] = buf[offset];
220         }
221
222         // load in the actual state table
223
l = BreakIterator.getInt(buf, offset);
224         offset += 4;
225         table = new short[l];
226         for (int i = 0; i < l; i++, offset+=2) {
227             table[i] = BreakIterator.getShort(buf, offset);
228         }
229
230         // finally, prepare the column map for supplementary characters
231
l = BreakIterator.getInt(buf, offset);
232         offset += 4;
233         int[] temp3 = new int[l];
234         for (int i = 0; i < l; i++, offset+=4) {
235             temp3[i] = BreakIterator.getInt(buf, offset);
236         }
237         supplementaryCharColumnMap = new SupplementaryCharacterData(temp3);
238     }
239
240     //=========================================================================
241
// access to the words
242
//=========================================================================
243

244     /**
245      * Uses the column map to map the character to a column number, then
246      * passes the row and column number to getNextState()
247      * @param row The current state
248      * @param ch The character whose column we're interested in
249      * @return The new state to transition to
250      */

251     public final short getNextStateFromCharacter(int row, int ch) {
252         int col;
253         if (ch < Character.MIN_SUPPLEMENTARY_CODE_POINT) {
254             col = columnMap.elementAt((char)ch);
255         } else {
256             col = supplementaryCharColumnMap.getValue(ch);
257         }
258         return getNextState(row, col);
259     }
260
261     /**
262      * Returns the value in the cell with the specified (logical) row and
263      * column numbers. In DictionaryBasedBreakIterator, the row number is
264      * a state number, the column number is an input, and the return value
265      * is the row number of the new state to transition to. (0 is the
266      * "error" state, and -1 is the "end of word" state in a dictionary)
267      * @param row The row number of the current state
268      * @param col The column number of the input character (0 means "not a
269      * dictionary character")
270      * @return The row number of the new state to transition to
271      */

272     public final short getNextState(int row, int col) {
273         if (cellIsPopulated(row, col)) {
274             // we map from logical to physical row number by looking up the
275
// mapping in rowIndex; we map from logical column number to
276
// physical column number by looking up a shift value for this
277
// logical row and offsetting the logical column number by
278
// the shift amount. Then we can use internalAt() to actually
279
// get the value out of the table.
280
return internalAt(rowIndex[row], col + rowIndexShifts[row]);
281         }
282         else {
283             return 0;
284         }
285     }
286
287     /**
288      * Given (logical) row and column numbers, returns true if the
289      * cell in that position is populated
290      */

291     private final boolean cellIsPopulated(int row, int col) {
292         // look up the entry in the bitmap index for the specified row.
293
// If it's a negative number, it's the column number of the only
294
// populated cell in the row
295
if (rowIndexFlagsIndex[row] < 0) {
296             return col == -rowIndexFlagsIndex[row];
297         }
298
299         // if it's a positive number, it's the offset of an entry in the bitmap
300
// list. If the table is more than 32 columns wide, the bitmap is stored
301
// successive entries in the bitmap list, so we have to divide the column
302
// number by 32 and offset the number we got out of the index by the result.
303
// Once we have the appropriate piece of the bitmap, test the appropriate
304
// bit and return the result.
305
else {
306             int flags = rowIndexFlags[rowIndexFlagsIndex[row] + (col >> 5)];
307             return (flags & (1 << (col & 0x1f))) != 0;
308         }
309     }
310
311     /**
312      * Implementation of getNextState() when we know the specified cell is
313      * populated.
314      * @param row The PHYSICAL row number of the cell
315      * @param col The PHYSICAL column number of the cell
316      * @return The value stored in the cell
317      */

318     private final short internalAt(int row, int col) {
319         // the table is a one-dimensional array, so this just does the math necessary
320
// to treat it as a two-dimensional array (we don't just use a two-dimensional
321
// array because two-dimensional arrays are inefficient in Java)
322
return table[row * numCols + col];
323     }
324 }
325
326
Popular Tags