KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > apache > fop > layout > hyphenation > HyphenationTree


1 /*
2  * $Id: HyphenationTree.java,v 1.5.2.4 2003/02/25 14:07:10 jeremias Exp $
3  * ============================================================================
4  * The Apache Software License, Version 1.1
5  * ============================================================================
6  *
7  * Copyright (C) 1999-2003 The Apache Software Foundation. All rights reserved.
8  *
9  * Redistribution and use in source and binary forms, with or without modifica-
10  * tion, are permitted provided that the following conditions are met:
11  *
12  * 1. Redistributions of source code must retain the above copyright notice,
13  * this list of conditions and the following disclaimer.
14  *
15  * 2. Redistributions in binary form must reproduce the above copyright notice,
16  * this list of conditions and the following disclaimer in the documentation
17  * and/or other materials provided with the distribution.
18  *
19  * 3. The end-user documentation included with the redistribution, if any, must
20  * include the following acknowledgment: "This product includes software
21  * developed by the Apache Software Foundation (http://www.apache.org/)."
22  * Alternately, this acknowledgment may appear in the software itself, if
23  * and wherever such third-party acknowledgments normally appear.
24  *
25  * 4. The names "FOP" and "Apache Software Foundation" must not be used to
26  * endorse or promote products derived from this software without prior
27  * written permission. For written permission, please contact
28  * apache@apache.org.
29  *
30  * 5. Products derived from this software may not be called "Apache", nor may
31  * "Apache" appear in their name, without prior written permission of the
32  * Apache Software Foundation.
33  *
34  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED WARRANTIES,
35  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
36  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
37  * APACHE SOFTWARE FOUNDATION OR ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
38  * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLU-
39  * DING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
40  * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
41  * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
42  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
43  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
44  * ============================================================================
45  *
46  * This software consists of voluntary contributions made by many individuals
47  * on behalf of the Apache Software Foundation and was originally created by
48  * James Tauber <jtauber@jtauber.com>. For more information on the Apache
49  * Software Foundation, please see <http://www.apache.org/>.
50  */

51 package org.apache.fop.layout.hyphenation;
52
53 import java.io.*;
54 import java.util.ArrayList JavaDoc;
55 import java.util.HashMap JavaDoc;
56
57 /**
58  * This tree structure stores the hyphenation patterns in an efficient
59  * way for fast lookup. It provides the provides the method to
60  * hyphenate a word.
61  *
62  * @author Carlos Villegas <cav@uniscope.co.jp>
63  */

64 public class HyphenationTree extends TernaryTree implements PatternConsumer,
65         Serializable {
66
67     /**
68      * value space: stores the inteletter values
69      */

70     protected ByteVector vspace;
71
72     /**
73      * This map stores hyphenation exceptions
74      */

75     protected HashMap JavaDoc stoplist;
76
77     /**
78      * This map stores the character classes
79      */

80     protected TernaryTree classmap;
81
82     /**
83      * Temporary map to store interletter values on pattern loading.
84      */

85     private transient TernaryTree ivalues;
86
87     public HyphenationTree() {
88         stoplist = new HashMap JavaDoc(23); // usually a small table
89
classmap = new TernaryTree();
90         vspace = new ByteVector();
91         vspace.alloc(1); // this reserves index 0, which we don't use
92
}
93
94     /**
95      * Packs the values by storing them in 4 bits, two values into a byte
96      * Values range is from 0 to 9. We use zero as terminator,
97      * so we'll add 1 to the value.
98      * @param values a string of digits from '0' to '9' representing the
99      * interletter values.
100      * @return the index into the vspace array where the packed values
101      * are stored.
102      */

103     protected int packValues(String JavaDoc values) {
104         int i, n = values.length();
105         int m = (n & 1) == 1 ? (n >> 1) + 2 : (n >> 1) + 1;
106         int offset = vspace.alloc(m);
107         byte[] va = vspace.getArray();
108         for (i = 0; i < n; i++) {
109             int j = i >> 1;
110             byte v = (byte)((values.charAt(i) - '0' + 1) & 0x0f);
111             if ((i & 1) == 1)
112                 va[j + offset] = (byte)(va[j + offset] | v);
113             else
114                 va[j + offset] = (byte)(v << 4); // big endian
115
}
116         va[m - 1 + offset] = 0; // terminator
117
return offset;
118     }
119
120     protected String JavaDoc unpackValues(int k) {
121         StringBuffer JavaDoc buf = new StringBuffer JavaDoc();
122         byte v = vspace.get(k++);
123         while (v != 0) {
124             char c = (char)((v >>> 4) - 1 + '0');
125             buf.append(c);
126             c = (char)(v & 0x0f);
127             if (c == 0)
128                 break;
129             c = (char)(c - 1 + '0');
130             buf.append(c);
131             v = vspace.get(k++);
132         }
133         return buf.toString();
134     }
135
136     /**
137      * Read hyphenation patterns from an XML file.
138      */

139     public void loadPatterns(String JavaDoc filename) throws HyphenationException {
140         PatternParser pp = new PatternParser(this);
141         ivalues = new TernaryTree();
142
143         pp.parse(filename);
144
145         // patterns/values should be now in the tree
146
// let's optimize a bit
147
trimToSize();
148         vspace.trimToSize();
149         classmap.trimToSize();
150
151         // get rid of the auxiliary map
152
ivalues = null;
153     }
154
155     public String JavaDoc findPattern(String JavaDoc pat) {
156         int k = super.find(pat);
157         if (k >= 0)
158             return unpackValues(k);
159         return "";
160     }
161
162     /**
163      * String compare, returns 0 if equal or
164      * t is a substring of s
165      */

166     protected int hstrcmp(char[] s, int si, char[] t, int ti) {
167         for (; s[si] == t[ti]; si++, ti++)
168             if (s[si] == 0)
169                 return 0;
170         if (t[ti] == 0)
171             return 0;
172         return s[si] - t[ti];
173     }
174
175     protected byte[] getValues(int k) {
176         StringBuffer JavaDoc buf = new StringBuffer JavaDoc();
177         byte v = vspace.get(k++);
178         while (v != 0) {
179             char c = (char)((v >>> 4) - 1);
180             buf.append(c);
181             c = (char)(v & 0x0f);
182             if (c == 0)
183                 break;
184             c = (char)(c - 1);
185             buf.append(c);
186             v = vspace.get(k++);
187         }
188         byte[] res = new byte[buf.length()];
189         for (int i = 0; i < res.length; i++)
190             res[i] = (byte)buf.charAt(i);
191         return res;
192     }
193
194     /**
195      * <p>Search for all possible partial matches of word starting
196      * at index an update interletter values. In other words, it
197      * does something like:</p>
198      * <code>
199      * for(i=0; i<patterns.length; i++) {
200      * if ( word.substring(index).startsWidth(patterns[i]) )
201      * update_interletter_values(patterns[i]);
202      * }
203      * </code>
204      * <p>But it is done in an efficient way since the patterns are
205      * stored in a ternary tree. In fact, this is the whole purpose
206      * of having the tree: doing this search without having to test
207      * every single pattern. The number of patterns for languages
208      * such as English range from 4000 to 10000. Thus, doing thousands
209      * of string comparisons for each word to hyphenate would be
210      * really slow without the tree. The tradeoff is memory, but
211      * using a ternary tree instead of a trie, almost halves the
212      * the memory used by Lout or TeX. It's also faster than using
213      * a hash table</p>
214      * @param word null terminated word to match
215      * @param index start index from word
216      * @param il interletter values array to update
217      */

218     protected void searchPatterns(char[] word, int index, byte[] il) {
219         byte[] values;
220         int i = index;
221         char p, q;
222         char sp = word[i];
223         p = root;
224
225         while (p > 0 && p < sc.length) {
226             if (sc[p] == 0xFFFF) {
227                 if (hstrcmp(word, i, kv.getArray(), lo[p]) == 0) {
228                     values = getValues(eq[p]); // data pointer is in eq[]
229
int j = index;
230                     for (int k = 0; k < values.length; k++) {
231                         if (j < il.length && values[k] > il[j])
232                             il[j] = values[k];
233                         j++;
234                     }
235                 }
236                 return;
237             }
238             int d = sp - sc[p];
239             if (d == 0) {
240                 if (sp == 0) {
241                     break;
242                 }
243                 sp = word[++i];
244                 p = eq[p];
245                 q = p;
246
247                 // look for a pattern ending at this position by searching for
248
// the null char ( splitchar == 0 )
249
while (q > 0 && q < sc.length) {
250                     if (sc[q] == 0xFFFF) { // stop at compressed branch
251
break;
252                     }
253                     if (sc[q] == 0) {
254                         values = getValues(eq[q]);
255                         int j = index;
256                         for (int k = 0; k < values.length; k++) {
257                             if (j < il.length && values[k] > il[j]) {
258                                 il[j] = values[k];
259                             }
260                             j++;
261                         }
262                         break;
263                     } else {
264                         q = lo[q];
265
266                         /**
267                          * actually the code should be:
268                          * q = sc[q] < 0 ? hi[q] : lo[q];
269                          * but java chars are unsigned
270                          */

271                     }
272                 }
273             } else
274                 p = d < 0 ? lo[p] : hi[p];
275         }
276     }
277
278     /**
279      * Hyphenate word and return a Hyphenation object.
280      * @param word the word to be hyphenated
281      * @param remainCharCount Minimum number of characters allowed
282      * before the hyphenation point.
283      * @param pushCharCount Minimum number of characters allowed after
284      * the hyphenation point.
285      * @return a {@link Hyphenation Hyphenation} object representing
286      * the hyphenated word or null if word is not hyphenated.
287      */

288     public Hyphenation hyphenate(String JavaDoc word, int remainCharCount,
289                                  int pushCharCount) {
290         char[] w = word.toCharArray();
291         return hyphenate(w, 0, w.length, remainCharCount, pushCharCount);
292     }
293
294     /**
295      * Hyphenate word and return an array of hyphenation points.
296      * @param w char array that contains the word
297      * @param offset Offset to first character in word
298      * @param len Length of word
299      * @param remainCharCount Minimum number of characters allowed
300      * before the hyphenation point.
301      * @param pushCharCount Minimum number of characters allowed after
302      * the hyphenation point.
303      * @return a {@link Hyphenation Hyphenation} object representing
304      * the hyphenated word or null if word is not hyphenated.
305      */

306     public Hyphenation hyphenate(char[] w, int offset, int len,
307                                  int remainCharCount, int pushCharCount) {
308         int i;
309         char[] word = new char[len + 3];
310
311         // normalize word
312
char[] c = new char[2];
313         for (i = 1; i <= len; i++) {
314             c[0] = w[offset + i - 1];
315             int nc = classmap.find(c, 0);
316             if (nc < 0) { // found a non-letter character, abort
317
return null;
318             }
319             word[i] = (char)nc;
320         }
321         int[] result = new int[len + 1];
322         int k = 0;
323
324         // check exception list first
325
String JavaDoc sw = new String JavaDoc(word, 1, len);
326         if (stoplist.containsKey(sw)) {
327             // assume only simple hyphens (Hyphen.pre="-", Hyphen.post = Hyphen.no = null)
328
ArrayList JavaDoc hw = (ArrayList JavaDoc)stoplist.get(sw);
329             int j = 0;
330             for (i = 0; i < hw.size(); i++) {
331                 Object JavaDoc o = hw.get(i);
332                 if (o instanceof String JavaDoc) {
333                     j += ((String JavaDoc)o).length();
334                     if (j >= remainCharCount && j < (len - pushCharCount))
335                         result[k++] = j;
336                 }
337             }
338         } else {
339             // use algorithm to get hyphenation points
340
word[0] = '.'; // word start marker
341
word[len + 1] = '.'; // word end marker
342
word[len + 2] = 0; // null terminated
343
byte[] il = new byte[len + 3]; // initialized to zero
344
for (i = 0; i < len + 1; i++) {
345                 searchPatterns(word, i, il);
346             }
347
348             // hyphenation points are located where interletter value is odd
349
for (i = 0; i < len; i++) {
350                 if (((il[i + 1] & 1) == 1) && i >= remainCharCount
351                         && i <= (len - pushCharCount)) {
352                     result[k++] = i;
353                 }
354             }
355         }
356
357
358         if (k > 0) {
359             // trim result array
360
int[] res = new int[k];
361             System.arraycopy(result, 0, res, 0, k);
362             return new Hyphenation(new String JavaDoc(w, offset, len), res);
363         } else {
364             return null;
365         }
366     }
367
368     /**
369      * Add a character class to the tree. It is used by
370      * {@link PatternParser PatternParser} as callback to
371      * add character classes. Character classes define the
372      * valid word characters for hyphenation. If a word contains
373      * a character not defined in any of the classes, it is not hyphenated.
374      * It also defines a way to normalize the characters in order
375      * to compare them with the stored patterns. Usually pattern
376      * files use only lower case characters, in this case a class
377      * for letter 'a', for example, should be defined as "aA", the first
378      * character being the normalization char.
379      */

380     public void addClass(String JavaDoc chargroup) {
381         if (chargroup.length() > 0) {
382             char equivChar = chargroup.charAt(0);
383             char[] key = new char[2];
384             key[1] = 0;
385             for (int i = 0; i < chargroup.length(); i++) {
386                 key[0] = chargroup.charAt(i);
387                 classmap.insert(key, 0, equivChar);
388             }
389         }
390     }
391
392     /**
393      * Add an exception to the tree. It is used by
394      * {@link PatternParser PatternParser} class as callback to
395      * store the hyphenation exceptions.
396      * @param word normalized word
397      * @param hyphenatedword a ArrayList of alternating strings and
398      * {@link Hyphen hyphen} objects.
399      */

400     public void addException(String JavaDoc word, ArrayList JavaDoc hyphenatedword) {
401         stoplist.put(word, hyphenatedword);
402     }
403
404     /**
405      * Add a pattern to the tree. Mainly, to be used by
406      * {@link PatternParser PatternParser} class as callback to
407      * add a pattern to the tree.
408      * @param pattern the hyphenation pattern
409      * @param ivalue interletter weight values indicating the
410      * desirability and priority of hyphenating at a given point
411      * within the pattern. It should contain only digit characters.
412      * (i.e. '0' to '9').
413      */

414     public void addPattern(String JavaDoc pattern, String JavaDoc ivalue) {
415         int k = ivalues.find(ivalue);
416         if (k <= 0) {
417             k = packValues(ivalue);
418             ivalues.insert(ivalue, (char)k);
419         }
420         insert(pattern, (char)k);
421     }
422
423     public void printStats() {
424         System.out.println("Value space size = "
425                            + Integer.toString(vspace.length()));
426         super.printStats();
427
428     }
429
430     public static void main(String JavaDoc[] argv) throws Exception JavaDoc {
431         HyphenationTree ht = null;
432         int minCharCount = 2;
433         BufferedReader in =
434             new BufferedReader(new InputStreamReader(System.in));
435         for (; ; ) {
436             System.out.print("l:\tload patterns from XML\n"
437                     + "L:\tload patterns from serialized object\n"
438                     + "s:\tset minimun character count\n"
439                     + "w:\twrite hyphenation tree to object file\n"
440                     + "h:\thyphenate\n"
441                     + "f:\tfind pattern\n"
442                     + "b:\tbenchmark\n"
443                     + "q:\tquit\n"
444                     + "\nCommand:");
445             String JavaDoc token = in.readLine().trim();
446             if (token.equals("f")) {
447                 System.out.print("Pattern: ");
448                 token = in.readLine().trim();
449                 System.out.println("Values: " + ht.findPattern(token));
450             } else if (token.equals("s")) {
451                 System.out.print("Minimun value: ");
452                 token = in.readLine().trim();
453                 minCharCount = Integer.parseInt(token);
454             } else if (token.equals("l")) {
455                 ht = new HyphenationTree();
456                 System.out.print("XML file name: ");
457                 token = in.readLine().trim();
458                 ht.loadPatterns(token);
459             } else if (token.equals("L")) {
460                 ObjectInputStream ois = null;
461                 System.out.print("Object file name: ");
462                 token = in.readLine().trim();
463                 try {
464                     ois = new ObjectInputStream(new FileInputStream(token));
465                     ht = (HyphenationTree)ois.readObject();
466                 } catch (Exception JavaDoc e) {
467                     e.printStackTrace();
468                 }
469                 finally {
470                     if (ois != null) {
471                         try {
472                             ois.close();
473                         } catch (IOException e) {}
474                     }
475                 }
476             } else if (token.equals("w")) {
477                 System.out.print("Object file name: ");
478                 token = in.readLine().trim();
479                 ObjectOutputStream oos = null;
480                 try {
481                     oos = new ObjectOutputStream(new FileOutputStream(token));
482                     oos.writeObject(ht);
483                 } catch (Exception JavaDoc e) {
484                     e.printStackTrace();
485                 }
486                 finally {
487                     if (oos != null) {
488                         try {
489                             oos.flush();
490                         } catch (IOException e) {}
491                         try {
492                             oos.close();
493                         } catch (IOException e) {}
494                     }
495                 }
496             } else if (token.equals("h")) {
497                 System.out.print("Word: ");
498                 token = in.readLine().trim();
499                 System.out.print("Hyphenation points: ");
500                 System.out.println(ht.hyphenate(token, minCharCount,
501                                                 minCharCount));
502             } else if (token.equals("b")) {
503                 if (ht == null) {
504                     System.out.println("No patterns has been loaded.");
505                     break;
506                 }
507                 System.out.print("Word list filename: ");
508                 token = in.readLine().trim();
509                 long starttime = 0;
510                 int counter = 0;
511                 ;
512                 try {
513                     BufferedReader reader =
514                         new BufferedReader(new FileReader(token));
515                     String JavaDoc line;
516
517                     starttime = System.currentTimeMillis();
518                     while ((line = reader.readLine()) != null) {
519                         // System.out.print("\nline: ");
520
Hyphenation hyp = ht.hyphenate(line, minCharCount,
521                                                        minCharCount);
522                         if (hyp != null) {
523                             String JavaDoc hword = hyp.toString();
524                             // System.out.println(line);
525
// System.out.println(hword);
526
} else {
527                             // System.out.println("No hyphenation");
528
}
529                         counter++;
530                     }
531                 } catch (Exception JavaDoc ioe) {
532                     System.out.println("Exception " + ioe);
533                     ioe.printStackTrace();
534                 }
535                 long endtime = System.currentTimeMillis();
536                 long result = endtime - starttime;
537                 System.out.println(counter + " words in " + result
538                                    + " Millisekunden hyphenated");
539
540             } else if (token.equals("q"))
541                 break;
542         }
543
544     }
545
546 }
547
Popular Tags