HyphenationTree


1   /*
2    * Copyright 1999-2004 The Apache Software Foundation.
3    * 
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    * 
8    *      http://www.apache.org/licenses/LICENSE-2.0
9    * 
10   * Unless required by applicable law or agreed to in writing, software
11   * distributed under the License is distributed on an "AS IS" BASIS,
12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   * See the License for the specific language governing permissions and
14   * limitations under the License.
15   */
16  
17  /* $Id: HyphenationTree.java 2623 2007-02-23 22:28:28Z xlv $ */
18   
19  package com.lowagie.text.pdf.hyphenation;
20  
21  import java.io.InputStream  ;
22  import java.util.ArrayList  ;
23  import java.util.HashMap  ;
24  
25  /**
26   * This tree structure stores the hyphenation patterns in an efficient
27   * way for fast lookup. It provides the provides the method to
28   * hyphenate a word.
29   *
30   * @author Carlos Villegas <cav@uniscope.co.jp>
31   */
32  public class HyphenationTree extends TernaryTree 
33              implements PatternConsumer {
34  
35      private static final long serialVersionUID = -7763254239309429432L;
36  
37      /**
38       * value space: stores the inteletter values
39       */
40      protected ByteVector vspace;
41  
42      /**
43       * This map stores hyphenation exceptions
44       */
45      protected HashMap   stoplist;
46  
47      /**
48       * This map stores the character classes
49       */
50      protected TernaryTree classmap;
51  
52      /**
53       * Temporary map to store interletter values on pattern loading.
54       */
55      private transient TernaryTree ivalues;
56  
57      public HyphenationTree() {
58          stoplist = new HashMap  (23);    // usually a small table
59          classmap = new TernaryTree();
60          vspace = new ByteVector();
61          vspace.alloc(1);    // this reserves index 0, which we don't use
62      }
63  
64      /**
65       * Packs the values by storing them in 4 bits, two values into a byte
66       * Values range is from 0 to 9. We use zero as terminator,
67       * so we'll add 1 to the value.
68       * @param values a string of digits from '0' to '9' representing the
69       * interletter values.
70       * @return the index into the vspace array where the packed values
71       * are stored.
72       */
73      protected int packValues(String   values) {
74          int i, n = values.length();
75          int m = (n & 1) == 1 ? (n >> 1) + 2 : (n >> 1) + 1;
76          int offset = vspace.alloc(m);
77          byte[] va = vspace.getArray();
78          for (i = 0; i < n; i++) {
79              int j = i >> 1;
80              byte v = (byte)((values.charAt(i) - '0' + 1) & 0x0f);
81              if ((i & 1) == 1) {
82                  va[j + offset] = (byte)(va[j + offset] | v);
83              } else {
84                  va[j + offset] = (byte)(v << 4);    // big endian
85              }
86          }
87          va[m - 1 + offset] = 0;    // terminator
88          return offset;
89      }
90  
91      protected String   unpackValues(int k) {
92          StringBuffer   buf = new StringBuffer  ();
93          byte v = vspace.get(k++);
94          while (v != 0) {
95              char c = (char)((v >>> 4) - 1 + '0');
96              buf.append(c);
97              c = (char)(v & 0x0f);
98              if (c == 0) {
99                  break;
100             }
101             c = (char)(c - 1 + '0');
102             buf.append(c);
103             v = vspace.get(k++);
104         }
105         return buf.toString();
106     }
107 
108     public void loadSimplePatterns(InputStream   stream) {
109         SimplePatternParser pp = new SimplePatternParser();
110         ivalues = new TernaryTree();
111 
112         pp.parse(stream, this);
113 
114         // patterns/values should be now in the tree
115         // let's optimize a bit
116         trimToSize();
117         vspace.trimToSize();
118         classmap.trimToSize();
119 
120         // get rid of the auxiliary map
121         ivalues = null;
122     }
123 
124 
125     public String   findPattern(String   pat) {
126         int k = super.find(pat);
127         if (k >= 0) {
128             return unpackValues(k);
129         }
130         return "";
131     }
132 
133     /**
134      * String compare, returns 0 if equal or
135      * t is a substring of s
136      */
137     protected int hstrcmp(char[] s, int si, char[] t, int ti) {
138         for (; s[si] == t[ti]; si++, ti++) {
139             if (s[si] == 0) {
140                 return 0;
141             }
142         }
143         if (t[ti] == 0) {
144             return 0;
145         }
146         return s[si] - t[ti];
147     }
148 
149     protected byte[] getValues(int k) {
150         StringBuffer   buf = new StringBuffer  ();
151         byte v = vspace.get(k++);
152         while (v != 0) {
153             char c = (char)((v >>> 4) - 1);
154             buf.append(c);
155             c = (char)(v & 0x0f);
156             if (c == 0) {
157                 break;
158             }
159             c = (char)(c - 1);
160             buf.append(c);
161             v = vspace.get(k++);
162         }
163         byte[] res = new byte[buf.length()];
164         for (int i = 0; i < res.length; i++) {
165             res[i] = (byte)buf.charAt(i);
166         }
167         return res;
168     }
169 
170     /**
171      * <p>Search for all possible partial matches of word starting
172      * at index an update interletter values. In other words, it
173      * does something like:</p>
174      * <code>
175      * for(i=0; i<patterns.length; i++) {
176      * if ( word.substring(index).startsWidth(patterns[i]) )
177      * update_interletter_values(patterns[i]);
178      * }
179      * </code>
180      * <p>But it is done in an efficient way since the patterns are
181      * stored in a ternary tree. In fact, this is the whole purpose
182      * of having the tree: doing this search without having to test
183      * every single pattern. The number of patterns for languages
184      * such as English range from 4000 to 10000. Thus, doing thousands
185      * of string comparisons for each word to hyphenate would be
186      * really slow without the tree. The tradeoff is memory, but
187      * using a ternary tree instead of a trie, almost halves the
188      * the memory used by Lout or TeX. It's also faster than using
189      * a hash table</p>
190      * @param word null terminated word to match
191      * @param index start index from word
192      * @param il interletter values array to update
193      */
194     protected void searchPatterns(char[] word, int index, byte[] il) {
195         byte[] values;
196         int i = index;
197         char p, q;
198         char sp = word[i];
199         p = root;
200 
201         while (p > 0 && p < sc.length) {
202             if (sc[p] == 0xFFFF) {
203                 if (hstrcmp(word, i, kv.getArray(), lo[p]) == 0) {
204                     values = getValues(eq[p]);    // data pointer is in eq[]
205                     int j = index;
206                     for (int k = 0; k < values.length; k++) {
207                         if (j < il.length && values[k] > il[j]) {
208                             il[j] = values[k];
209                         }
210                         j++;
211                     }
212                 }
213                 return;
214             }
215             int d = sp - sc[p];
216             if (d == 0) {
217                 if (sp == 0) {
218                     break;
219                 }
220                 sp = word[++i];
221                 p = eq[p];
222                 q = p;
223 
224                 // look for a pattern ending at this position by searching for
225                 // the null char ( splitchar == 0 )
226                 while (q > 0 && q < sc.length) {
227                     if (sc[q] == 0xFFFF) {        // stop at compressed branch
228                         break;
229                     }
230                     if (sc[q] == 0) {
231                         values = getValues(eq[q]);
232                         int j = index;
233                         for (int k = 0; k < values.length; k++) {
234                             if (j < il.length && values[k] > il[j]) {
235                                 il[j] = values[k];
236                             }
237                             j++;
238                         }
239                         break;
240                     } else {
241                         q = lo[q];
242 
243                         /**
244                          * actually the code should be:
245                          * q = sc[q] < 0 ? hi[q] : lo[q];
246                          * but java chars are unsigned
247                          */
248                     }
249                 }
250             } else {
251                 p = d < 0 ? lo[p] : hi[p];
252             }
253         }
254     }
255 
256     /**
257      * Hyphenate word and return a Hyphenation object.
258      * @param word the word to be hyphenated
259      * @param remainCharCount Minimum number of characters allowed
260      * before the hyphenation point.
261      * @param pushCharCount Minimum number of characters allowed after
262      * the hyphenation point.
263      * @return a {@link Hyphenation Hyphenation} object representing
264      * the hyphenated word or null if word is not hyphenated.
265      */
266     public Hyphenation hyphenate(String   word, int remainCharCount,
267                                  int pushCharCount) {
268         char[] w = word.toCharArray();
269         return hyphenate(w, 0, w.length, remainCharCount, pushCharCount);
270     }
271 
272     /**
273      * w = "****nnllllllnnn*****",
274      * where n is a non-letter, l is a letter,
275      * all n may be absent, the first n is at offset,
276      * the first l is at offset + iIgnoreAtBeginning;
277      * word = ".llllll.'\0'***",
278      * where all l in w are copied into word.
279      * In the first part of the routine len = w.length,
280      * in the second part of the routine len = word.length.
281      * Three indices are used:
282      * index(w), the index in w,
283      * index(word), the index in word,
284      * letterindex(word), the index in the letter part of word.
285      * The following relations exist:
286      * index(w) = offset + i - 1
287      * index(word) = i - iIgnoreAtBeginning
288      * letterindex(word) = index(word) - 1
289      * (see first loop).
290      * It follows that:
291      * index(w) - index(word) = offset - 1 + iIgnoreAtBeginning
292      * index(w) = letterindex(word) + offset + iIgnoreAtBeginning
293      */
294 
295     /**
296      * Hyphenate word and return an array of hyphenation points.
297      * @param w char array that contains the word
298      * @param offset Offset to first character in word
299      * @param len Length of word
300      * @param remainCharCount Minimum number of characters allowed
301      * before the hyphenation point.
302      * @param pushCharCount Minimum number of characters allowed after
303      * the hyphenation point.
304      * @return a {@link Hyphenation Hyphenation} object representing
305      * the hyphenated word or null if word is not hyphenated.
306      */
307     public Hyphenation hyphenate(char[] w, int offset, int len,
308                                  int remainCharCount, int pushCharCount) {
309         int i;
310         char[] word = new char[len + 3];
311 
312         // normalize word
313         char[] c = new char[2];
314         int iIgnoreAtBeginning = 0;
315         int iLength = len;
316         boolean bEndOfLetters = false;
317         for (i = 1; i <= len; i++) {
318             c[0] = w[offset + i - 1];
319             int nc = classmap.find(c, 0);
320             if (nc < 0) {    // found a non-letter character ...
321                 if (i == (1 + iIgnoreAtBeginning)) {
322                     // ... before any letter character
323                     iIgnoreAtBeginning ++;
324                 } else {
325                     // ... after a letter character
326                     bEndOfLetters = true;
327                 }
328                 iLength --;
329             } else {
330                 if (!bEndOfLetters) {
331                     word[i - iIgnoreAtBeginning] = (char)nc;
332                 } else {
333                     return null;
334                 }
335             }
336         }
337         len = iLength;
338         if (len < (remainCharCount + pushCharCount)) {
339             // word is too short to be hyphenated
340             return null;
341         }
342         int[] result = new int[len + 1];
343         int k = 0;
344 
345         // check exception list first
346         String   sw = new String  (word, 1, len);
347         if (stoplist.containsKey(sw)) {
348             // assume only simple hyphens (Hyphen.pre="-", Hyphen.post = Hyphen.no = null)
349             ArrayList   hw = (ArrayList  )stoplist.get(sw);
350             int j = 0;
351             for (i = 0; i < hw.size(); i++) {
352                 Object   o = hw.get(i);
353                 // j = index(sw) = letterindex(word)?
354                 // result[k] = corresponding index(w)
355                 if (o instanceof String  ) {
356                     j += ((String  )o).length();
357                     if (j >= remainCharCount && j < (len - pushCharCount)) {
358                         result[k++] = j + iIgnoreAtBeginning;
359                     }
360                 }
361             }
362         } else {
363             // use algorithm to get hyphenation points
364             word[0] = '.';                    // word start marker
365             word[len + 1] = '.';              // word end marker
366             word[len + 2] = 0;                // null terminated
367             byte[] il = new byte[len + 3];    // initialized to zero
368             for (i = 0; i < len + 1; i++) {
369                 searchPatterns(word, i, il);
370             }
371 
372             // hyphenation points are located where interletter value is odd
373             // i is letterindex(word),
374             // i + 1 is index(word),
375             // result[k] = corresponding index(w)
376             for (i = 0; i < len; i++) {
377                 if (((il[i + 1] & 1) == 1) && i >= remainCharCount
378                         && i <= (len - pushCharCount)) {
379                     result[k++] = i + iIgnoreAtBeginning;
380                 }
381             }
382         }
383 
384 
385         if (k > 0) {
386             // trim result array
387             int[] res = new int[k];
388             System.arraycopy(result, 0, res, 0, k);
389             return new Hyphenation(new String  (w, offset, len), res);
390         } else {
391             return null;
392         }
393     }
394 
395     /**
396      * Add a character class to the tree. It is used by
397      * {@link SimplePatternParser SimplePatternParser} as callback to
398      * add character classes. Character classes define the
399      * valid word characters for hyphenation. If a word contains
400      * a character not defined in any of the classes, it is not hyphenated.
401      * It also defines a way to normalize the characters in order
402      * to compare them with the stored patterns. Usually pattern
403      * files use only lower case characters, in this case a class
404      * for letter 'a', for example, should be defined as "aA", the first
405      * character being the normalization char.
406      */
407     public void addClass(String   chargroup) {
408         if (chargroup.length() > 0) {
409             char equivChar = chargroup.charAt(0);
410             char[] key = new char[2];
411             key[1] = 0;
412             for (int i = 0; i < chargroup.length(); i++) {
413                 key[0] = chargroup.charAt(i);
414                 classmap.insert(key, 0, equivChar);
415             }
416         }
417     }
418 
419     /**
420      * Add an exception to the tree. It is used by
421      * {@link SimplePatternParser SimplePatternParser} class as callback to
422      * store the hyphenation exceptions.
423      * @param word normalized word
424      * @param hyphenatedword a vector of alternating strings and
425      * {@link Hyphen hyphen} objects.
426      */
427     public void addException(String   word, ArrayList   hyphenatedword) {
428         stoplist.put(word, hyphenatedword);
429     }
430 
431     /**
432      * Add a pattern to the tree. Mainly, to be used by
433      * {@link SimplePatternParser SimplePatternParser} class as callback to
434      * add a pattern to the tree.
435      * @param pattern the hyphenation pattern
436      * @param ivalue interletter weight values indicating the
437      * desirability and priority of hyphenating at a given point
438      * within the pattern. It should contain only digit characters.
439      * (i.e. '0' to '9').
440      */
441     public void addPattern(String   pattern, String   ivalue) {
442         int k = ivalues.find(ivalue);
443         if (k <= 0) {
444             k = packValues(ivalue);
445             ivalues.insert(ivalue, (char)k);
446         }
447         insert(pattern, (char)k);
448     }
449 
450     public void printStats() {
451         System.out.println("Value space size = "
452                            + Integer.toString(vspace.length()));
453         super.printStats();
454     }
455 }
456
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags