KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > com > lowagie > text > pdf > hyphenation > HyphenationTree


1 /*
2  * Copyright 1999-2004 The Apache Software Foundation.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */

16
17 /* $Id: HyphenationTree.java 2623 2007-02-23 22:28:28Z xlv $ */
18  
19 package com.lowagie.text.pdf.hyphenation;
20
21 import java.io.InputStream JavaDoc;
22 import java.util.ArrayList JavaDoc;
23 import java.util.HashMap JavaDoc;
24
25 /**
26  * This tree structure stores the hyphenation patterns in an efficient
27  * way for fast lookup. It provides the provides the method to
28  * hyphenate a word.
29  *
30  * @author Carlos Villegas <cav@uniscope.co.jp>
31  */

32 public class HyphenationTree extends TernaryTree
33             implements PatternConsumer {
34
35     private static final long serialVersionUID = -7763254239309429432L;
36
37     /**
38      * value space: stores the inteletter values
39      */

40     protected ByteVector vspace;
41
42     /**
43      * This map stores hyphenation exceptions
44      */

45     protected HashMap JavaDoc stoplist;
46
47     /**
48      * This map stores the character classes
49      */

50     protected TernaryTree classmap;
51
52     /**
53      * Temporary map to store interletter values on pattern loading.
54      */

55     private transient TernaryTree ivalues;
56
57     public HyphenationTree() {
58         stoplist = new HashMap JavaDoc(23); // usually a small table
59
classmap = new TernaryTree();
60         vspace = new ByteVector();
61         vspace.alloc(1); // this reserves index 0, which we don't use
62
}
63
64     /**
65      * Packs the values by storing them in 4 bits, two values into a byte
66      * Values range is from 0 to 9. We use zero as terminator,
67      * so we'll add 1 to the value.
68      * @param values a string of digits from '0' to '9' representing the
69      * interletter values.
70      * @return the index into the vspace array where the packed values
71      * are stored.
72      */

73     protected int packValues(String JavaDoc values) {
74         int i, n = values.length();
75         int m = (n & 1) == 1 ? (n >> 1) + 2 : (n >> 1) + 1;
76         int offset = vspace.alloc(m);
77         byte[] va = vspace.getArray();
78         for (i = 0; i < n; i++) {
79             int j = i >> 1;
80             byte v = (byte)((values.charAt(i) - '0' + 1) & 0x0f);
81             if ((i & 1) == 1) {
82                 va[j + offset] = (byte)(va[j + offset] | v);
83             } else {
84                 va[j + offset] = (byte)(v << 4); // big endian
85
}
86         }
87         va[m - 1 + offset] = 0; // terminator
88
return offset;
89     }
90
91     protected String JavaDoc unpackValues(int k) {
92         StringBuffer JavaDoc buf = new StringBuffer JavaDoc();
93         byte v = vspace.get(k++);
94         while (v != 0) {
95             char c = (char)((v >>> 4) - 1 + '0');
96             buf.append(c);
97             c = (char)(v & 0x0f);
98             if (c == 0) {
99                 break;
100             }
101             c = (char)(c - 1 + '0');
102             buf.append(c);
103             v = vspace.get(k++);
104         }
105         return buf.toString();
106     }
107
108     public void loadSimplePatterns(InputStream JavaDoc stream) {
109         SimplePatternParser pp = new SimplePatternParser();
110         ivalues = new TernaryTree();
111
112         pp.parse(stream, this);
113
114         // patterns/values should be now in the tree
115
// let's optimize a bit
116
trimToSize();
117         vspace.trimToSize();
118         classmap.trimToSize();
119
120         // get rid of the auxiliary map
121
ivalues = null;
122     }
123
124
125     public String JavaDoc findPattern(String JavaDoc pat) {
126         int k = super.find(pat);
127         if (k >= 0) {
128             return unpackValues(k);
129         }
130         return "";
131     }
132
133     /**
134      * String compare, returns 0 if equal or
135      * t is a substring of s
136      */

137     protected int hstrcmp(char[] s, int si, char[] t, int ti) {
138         for (; s[si] == t[ti]; si++, ti++) {
139             if (s[si] == 0) {
140                 return 0;
141             }
142         }
143         if (t[ti] == 0) {
144             return 0;
145         }
146         return s[si] - t[ti];
147     }
148
149     protected byte[] getValues(int k) {
150         StringBuffer JavaDoc buf = new StringBuffer JavaDoc();
151         byte v = vspace.get(k++);
152         while (v != 0) {
153             char c = (char)((v >>> 4) - 1);
154             buf.append(c);
155             c = (char)(v & 0x0f);
156             if (c == 0) {
157                 break;
158             }
159             c = (char)(c - 1);
160             buf.append(c);
161             v = vspace.get(k++);
162         }
163         byte[] res = new byte[buf.length()];
164         for (int i = 0; i < res.length; i++) {
165             res[i] = (byte)buf.charAt(i);
166         }
167         return res;
168     }
169
170     /**
171      * <p>Search for all possible partial matches of word starting
172      * at index an update interletter values. In other words, it
173      * does something like:</p>
174      * <code>
175      * for(i=0; i<patterns.length; i++) {
176      * if ( word.substring(index).startsWidth(patterns[i]) )
177      * update_interletter_values(patterns[i]);
178      * }
179      * </code>
180      * <p>But it is done in an efficient way since the patterns are
181      * stored in a ternary tree. In fact, this is the whole purpose
182      * of having the tree: doing this search without having to test
183      * every single pattern. The number of patterns for languages
184      * such as English range from 4000 to 10000. Thus, doing thousands
185      * of string comparisons for each word to hyphenate would be
186      * really slow without the tree. The tradeoff is memory, but
187      * using a ternary tree instead of a trie, almost halves the
188      * the memory used by Lout or TeX. It's also faster than using
189      * a hash table</p>
190      * @param word null terminated word to match
191      * @param index start index from word
192      * @param il interletter values array to update
193      */

194     protected void searchPatterns(char[] word, int index, byte[] il) {
195         byte[] values;
196         int i = index;
197         char p, q;
198         char sp = word[i];
199         p = root;
200
201         while (p > 0 && p < sc.length) {
202             if (sc[p] == 0xFFFF) {
203                 if (hstrcmp(word, i, kv.getArray(), lo[p]) == 0) {
204                     values = getValues(eq[p]); // data pointer is in eq[]
205
int j = index;
206                     for (int k = 0; k < values.length; k++) {
207                         if (j < il.length && values[k] > il[j]) {
208                             il[j] = values[k];
209                         }
210                         j++;
211                     }
212                 }
213                 return;
214             }
215             int d = sp - sc[p];
216             if (d == 0) {
217                 if (sp == 0) {
218                     break;
219                 }
220                 sp = word[++i];
221                 p = eq[p];
222                 q = p;
223
224                 // look for a pattern ending at this position by searching for
225
// the null char ( splitchar == 0 )
226
while (q > 0 && q < sc.length) {
227                     if (sc[q] == 0xFFFF) { // stop at compressed branch
228
break;
229                     }
230                     if (sc[q] == 0) {
231                         values = getValues(eq[q]);
232                         int j = index;
233                         for (int k = 0; k < values.length; k++) {
234                             if (j < il.length && values[k] > il[j]) {
235                                 il[j] = values[k];
236                             }
237                             j++;
238                         }
239                         break;
240                     } else {
241                         q = lo[q];
242
243                         /**
244                          * actually the code should be:
245                          * q = sc[q] < 0 ? hi[q] : lo[q];
246                          * but java chars are unsigned
247                          */

248                     }
249                 }
250             } else {
251                 p = d < 0 ? lo[p] : hi[p];
252             }
253         }
254     }
255
256     /**
257      * Hyphenate word and return a Hyphenation object.
258      * @param word the word to be hyphenated
259      * @param remainCharCount Minimum number of characters allowed
260      * before the hyphenation point.
261      * @param pushCharCount Minimum number of characters allowed after
262      * the hyphenation point.
263      * @return a {@link Hyphenation Hyphenation} object representing
264      * the hyphenated word or null if word is not hyphenated.
265      */

266     public Hyphenation hyphenate(String JavaDoc word, int remainCharCount,
267                                  int pushCharCount) {
268         char[] w = word.toCharArray();
269         return hyphenate(w, 0, w.length, remainCharCount, pushCharCount);
270     }
271
272     /**
273      * w = "****nnllllllnnn*****",
274      * where n is a non-letter, l is a letter,
275      * all n may be absent, the first n is at offset,
276      * the first l is at offset + iIgnoreAtBeginning;
277      * word = ".llllll.'\0'***",
278      * where all l in w are copied into word.
279      * In the first part of the routine len = w.length,
280      * in the second part of the routine len = word.length.
281      * Three indices are used:
282      * index(w), the index in w,
283      * index(word), the index in word,
284      * letterindex(word), the index in the letter part of word.
285      * The following relations exist:
286      * index(w) = offset + i - 1
287      * index(word) = i - iIgnoreAtBeginning
288      * letterindex(word) = index(word) - 1
289      * (see first loop).
290      * It follows that:
291      * index(w) - index(word) = offset - 1 + iIgnoreAtBeginning
292      * index(w) = letterindex(word) + offset + iIgnoreAtBeginning
293      */

294
295     /**
296      * Hyphenate word and return an array of hyphenation points.
297      * @param w char array that contains the word
298      * @param offset Offset to first character in word
299      * @param len Length of word
300      * @param remainCharCount Minimum number of characters allowed
301      * before the hyphenation point.
302      * @param pushCharCount Minimum number of characters allowed after
303      * the hyphenation point.
304      * @return a {@link Hyphenation Hyphenation} object representing
305      * the hyphenated word or null if word is not hyphenated.
306      */

307     public Hyphenation hyphenate(char[] w, int offset, int len,
308                                  int remainCharCount, int pushCharCount) {
309         int i;
310         char[] word = new char[len + 3];
311
312         // normalize word
313
char[] c = new char[2];
314         int iIgnoreAtBeginning = 0;
315         int iLength = len;
316         boolean bEndOfLetters = false;
317         for (i = 1; i <= len; i++) {
318             c[0] = w[offset + i - 1];
319             int nc = classmap.find(c, 0);
320             if (nc < 0) { // found a non-letter character ...
321
if (i == (1 + iIgnoreAtBeginning)) {
322                     // ... before any letter character
323
iIgnoreAtBeginning ++;
324                 } else {
325                     // ... after a letter character
326
bEndOfLetters = true;
327                 }
328                 iLength --;
329             } else {
330                 if (!bEndOfLetters) {
331                     word[i - iIgnoreAtBeginning] = (char)nc;
332                 } else {
333                     return null;
334                 }
335             }
336         }
337         len = iLength;
338         if (len < (remainCharCount + pushCharCount)) {
339             // word is too short to be hyphenated
340
return null;
341         }
342         int[] result = new int[len + 1];
343         int k = 0;
344
345         // check exception list first
346
String JavaDoc sw = new String JavaDoc(word, 1, len);
347         if (stoplist.containsKey(sw)) {
348             // assume only simple hyphens (Hyphen.pre="-", Hyphen.post = Hyphen.no = null)
349
ArrayList JavaDoc hw = (ArrayList JavaDoc)stoplist.get(sw);
350             int j = 0;
351             for (i = 0; i < hw.size(); i++) {
352                 Object JavaDoc o = hw.get(i);
353                 // j = index(sw) = letterindex(word)?
354
// result[k] = corresponding index(w)
355
if (o instanceof String JavaDoc) {
356                     j += ((String JavaDoc)o).length();
357                     if (j >= remainCharCount && j < (len - pushCharCount)) {
358                         result[k++] = j + iIgnoreAtBeginning;
359                     }
360                 }
361             }
362         } else {
363             // use algorithm to get hyphenation points
364
word[0] = '.'; // word start marker
365
word[len + 1] = '.'; // word end marker
366
word[len + 2] = 0; // null terminated
367
byte[] il = new byte[len + 3]; // initialized to zero
368
for (i = 0; i < len + 1; i++) {
369                 searchPatterns(word, i, il);
370             }
371
372             // hyphenation points are located where interletter value is odd
373
// i is letterindex(word),
374
// i + 1 is index(word),
375
// result[k] = corresponding index(w)
376
for (i = 0; i < len; i++) {
377                 if (((il[i + 1] & 1) == 1) && i >= remainCharCount
378                         && i <= (len - pushCharCount)) {
379                     result[k++] = i + iIgnoreAtBeginning;
380                 }
381             }
382         }
383
384
385         if (k > 0) {
386             // trim result array
387
int[] res = new int[k];
388             System.arraycopy(result, 0, res, 0, k);
389             return new Hyphenation(new String JavaDoc(w, offset, len), res);
390         } else {
391             return null;
392         }
393     }
394
395     /**
396      * Add a character class to the tree. It is used by
397      * {@link SimplePatternParser SimplePatternParser} as callback to
398      * add character classes. Character classes define the
399      * valid word characters for hyphenation. If a word contains
400      * a character not defined in any of the classes, it is not hyphenated.
401      * It also defines a way to normalize the characters in order
402      * to compare them with the stored patterns. Usually pattern
403      * files use only lower case characters, in this case a class
404      * for letter 'a', for example, should be defined as "aA", the first
405      * character being the normalization char.
406      */

407     public void addClass(String JavaDoc chargroup) {
408         if (chargroup.length() > 0) {
409             char equivChar = chargroup.charAt(0);
410             char[] key = new char[2];
411             key[1] = 0;
412             for (int i = 0; i < chargroup.length(); i++) {
413                 key[0] = chargroup.charAt(i);
414                 classmap.insert(key, 0, equivChar);
415             }
416         }
417     }
418
419     /**
420      * Add an exception to the tree. It is used by
421      * {@link SimplePatternParser SimplePatternParser} class as callback to
422      * store the hyphenation exceptions.
423      * @param word normalized word
424      * @param hyphenatedword a vector of alternating strings and
425      * {@link Hyphen hyphen} objects.
426      */

427     public void addException(String JavaDoc word, ArrayList JavaDoc hyphenatedword) {
428         stoplist.put(word, hyphenatedword);
429     }
430
431     /**
432      * Add a pattern to the tree. Mainly, to be used by
433      * {@link SimplePatternParser SimplePatternParser} class as callback to
434      * add a pattern to the tree.
435      * @param pattern the hyphenation pattern
436      * @param ivalue interletter weight values indicating the
437      * desirability and priority of hyphenating at a given point
438      * within the pattern. It should contain only digit characters.
439      * (i.e. '0' to '9').
440      */

441     public void addPattern(String JavaDoc pattern, String JavaDoc ivalue) {
442         int k = ivalues.find(ivalue);
443         if (k <= 0) {
444             k = packValues(ivalue);
445             ivalues.insert(ivalue, (char)k);
446         }
447         insert(pattern, (char)k);
448     }
449
450     public void printStats() {
451         System.out.println("Value space size = "
452                            + Integer.toString(vspace.length()));
453         super.printStats();
454     }
455 }
456
Popular Tags