KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > com > lowagie > text > pdf > hyphenation > TernaryTree


1 /*
2  * Copyright 1999-2004 The Apache Software Foundation.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */

16
17 package com.lowagie.text.pdf.hyphenation;
18
19 import java.io.Serializable JavaDoc;
20 import java.util.Enumeration JavaDoc;
21 import java.util.Stack JavaDoc;
22
23 /**
24  * <h2>Ternary Search Tree.</h2>
25  *
26  * <p>A ternary search tree is a hibrid between a binary tree and
27  * a digital search tree (trie). Keys are limited to strings.
28  * A data value of type char is stored in each leaf node.
29  * It can be used as an index (or pointer) to the data.
30  * Branches that only contain one key are compressed to one node
31  * by storing a pointer to the trailer substring of the key.
32  * This class is intended to serve as base class or helper class
33  * to implement Dictionary collections or the like. Ternary trees
34  * have some nice properties as the following: the tree can be
35  * traversed in sorted order, partial matches (wildcard) can be
36  * implemented, retrieval of all keys within a given distance
37  * from the target, etc. The storage requirements are higher than
38  * a binary tree but a lot less than a trie. Performance is
39  * comparable with a hash table, sometimes it outperforms a hash
40  * function (most of the time can determine a miss faster than a hash).</p>
41  *
42  * <p>The main purpose of this java port is to serve as a base for
43  * implementing TeX's hyphenation algorithm (see The TeXBook,
44  * appendix H). Each language requires from 5000 to 15000 hyphenation
45  * patterns which will be keys in this tree. The strings patterns
46  * are usually small (from 2 to 5 characters), but each char in the
47  * tree is stored in a node. Thus memory usage is the main concern.
48  * We will sacrify 'elegance' to keep memory requirenments to the
49  * minimum. Using java's char type as pointer (yes, I know pointer
50  * it is a forbidden word in java) we can keep the size of the node
51  * to be just 8 bytes (3 pointers and the data char). This gives
52  * room for about 65000 nodes. In my tests the english patterns
53  * took 7694 nodes and the german patterns 10055 nodes,
54  * so I think we are safe.</p>
55  *
56  * <p>All said, this is a map with strings as keys and char as value.
57  * Pretty limited!. It can be extended to a general map by
58  * using the string representation of an object and using the
59  * char value as an index to an array that contains the object
60  * values.</p>
61  *
62  * @author cav@uniscope.co.jp
63  */

64
65 public class TernaryTree implements Cloneable JavaDoc, Serializable JavaDoc {
66
67     /**
68      * We use 4 arrays to represent a node. I guess I should have created
69      * a proper node class, but somehow Knuth's pascal code made me forget
70      * we now have a portable language with virtual memory management and
71      * automatic garbage collection! And now is kind of late, furthermore,
72      * if it ain't broken, don't fix it.
73      */

74
75     private static final long serialVersionUID = 5313366505322983510L;
76
77     /**
78      * Pointer to low branch and to rest of the key when it is
79      * stored directly in this node, we don't have unions in java!
80      */

81     protected char[] lo;
82
83     /**
84      * Pointer to high branch.
85      */

86     protected char[] hi;
87
88     /**
89      * Pointer to equal branch and to data when this node is a string terminator.
90      */

91     protected char[] eq;
92
93     /**
94      * <P>The character stored in this node: splitchar.
95      * Two special values are reserved:</P>
96      * <ul><li>0x0000 as string terminator</li>
97      * <li>0xFFFF to indicate that the branch starting at
98      * this node is compressed</li></ul>
99      * <p>This shouldn't be a problem if we give the usual semantics to
100      * strings since 0xFFFF is garanteed not to be an Unicode character.</p>
101      */

102     protected char[] sc;
103
104     /**
105      * This vector holds the trailing of the keys when the branch is compressed.
106      */

107     protected CharVector kv;
108
109     protected char root;
110     protected char freenode;
111     protected int length; // number of items in tree
112

113     protected static final int BLOCK_SIZE = 2048; // allocation size for arrays
114

115     TernaryTree() {
116         init();
117     }
118
119     protected void init() {
120         root = 0;
121         freenode = 1;
122         length = 0;
123         lo = new char[BLOCK_SIZE];
124         hi = new char[BLOCK_SIZE];
125         eq = new char[BLOCK_SIZE];
126         sc = new char[BLOCK_SIZE];
127         kv = new CharVector();
128     }
129
130     /**
131      * Branches are initially compressed, needing
132      * one node per key plus the size of the string
133      * key. They are decompressed as needed when
134      * another key with same prefix
135      * is inserted. This saves a lot of space,
136      * specially for long keys.
137      */

138     public void insert(String JavaDoc key, char val) {
139         // make sure we have enough room in the arrays
140
int len = key.length()
141                   + 1; // maximum number of nodes that may be generated
142
if (freenode + len > eq.length) {
143             redimNodeArrays(eq.length + BLOCK_SIZE);
144         }
145         char strkey[] = new char[len--];
146         key.getChars(0, len, strkey, 0);
147         strkey[len] = 0;
148         root = insert(root, strkey, 0, val);
149     }
150
151     public void insert(char[] key, int start, char val) {
152         int len = strlen(key) + 1;
153         if (freenode + len > eq.length) {
154             redimNodeArrays(eq.length + BLOCK_SIZE);
155         }
156         root = insert(root, key, start, val);
157     }
158
159     /**
160      * The actual insertion function, recursive version.
161      */

162     private char insert(char p, char[] key, int start, char val) {
163         int len = strlen(key, start);
164         if (p == 0) {
165             // this means there is no branch, this node will start a new branch.
166
// Instead of doing that, we store the key somewhere else and create
167
// only one node with a pointer to the key
168
p = freenode++;
169             eq[p] = val; // holds data
170
length++;
171             hi[p] = 0;
172             if (len > 0) {
173                 sc[p] = 0xFFFF; // indicates branch is compressed
174
lo[p] = (char)kv.alloc(len
175                                        + 1); // use 'lo' to hold pointer to key
176
strcpy(kv.getArray(), lo[p], key, start);
177             } else {
178                 sc[p] = 0;
179                 lo[p] = 0;
180             }
181             return p;
182         }
183
184         if (sc[p] == 0xFFFF) {
185             // branch is compressed: need to decompress
186
// this will generate garbage in the external key array
187
// but we can do some garbage collection later
188
char pp = freenode++;
189             lo[pp] = lo[p]; // previous pointer to key
190
eq[pp] = eq[p]; // previous pointer to data
191
lo[p] = 0;
192             if (len > 0) {
193                 sc[p] = kv.get(lo[pp]);
194                 eq[p] = pp;
195                 lo[pp]++;
196                 if (kv.get(lo[pp]) == 0) {
197                     // key completly decompressed leaving garbage in key array
198
lo[pp] = 0;
199                     sc[pp] = 0;
200                     hi[pp] = 0;
201                 } else {
202                     // we only got first char of key, rest is still there
203
sc[pp] = 0xFFFF;
204                 }
205             } else {
206                 // In this case we can save a node by swapping the new node
207
// with the compressed node
208
sc[pp] = 0xFFFF;
209                 hi[p] = pp;
210                 sc[p] = 0;
211                 eq[p] = val;
212                 length++;
213                 return p;
214             }
215         }
216         char s = key[start];
217         if (s < sc[p]) {
218             lo[p] = insert(lo[p], key, start, val);
219         } else if (s == sc[p]) {
220             if (s != 0) {
221                 eq[p] = insert(eq[p], key, start + 1, val);
222             } else {
223                 // key already in tree, overwrite data
224
eq[p] = val;
225             }
226         } else {
227             hi[p] = insert(hi[p], key, start, val);
228         }
229         return p;
230     }
231
232     /**
233      * Compares 2 null terminated char arrays
234      */

235     public static int strcmp(char[] a, int startA, char[] b, int startB) {
236         for (; a[startA] == b[startB]; startA++, startB++) {
237             if (a[startA] == 0) {
238                 return 0;
239             }
240         }
241         return a[startA] - b[startB];
242     }
243
244     /**
245      * Compares a string with null terminated char array
246      */

247     public static int strcmp(String JavaDoc str, char[] a, int start) {
248         int i, d, len = str.length();
249         for (i = 0; i < len; i++) {
250             d = (int)str.charAt(i) - a[start + i];
251             if (d != 0) {
252                 return d;
253             }
254             if (a[start + i] == 0) {
255                 return d;
256         }
257         }
258         if (a[start + i] != 0) {
259             return (int)-a[start + i];
260         }
261         return 0;
262
263     }
264
265     public static void strcpy(char[] dst, int di, char[] src, int si) {
266         while (src[si] != 0) {
267             dst[di++] = src[si++];
268         }
269         dst[di] = 0;
270     }
271
272     public static int strlen(char[] a, int start) {
273         int len = 0;
274         for (int i = start; i < a.length && a[i] != 0; i++) {
275             len++;
276         }
277         return len;
278     }
279
280     public static int strlen(char[] a) {
281         return strlen(a, 0);
282     }
283
284     public int find(String JavaDoc key) {
285         int len = key.length();
286         char strkey[] = new char[len + 1];
287         key.getChars(0, len, strkey, 0);
288         strkey[len] = 0;
289
290         return find(strkey, 0);
291     }
292
293     public int find(char[] key, int start) {
294         int d;
295         char p = root;
296         int i = start;
297         char c;
298
299         while (p != 0) {
300             if (sc[p] == 0xFFFF) {
301                 if (strcmp(key, i, kv.getArray(), lo[p]) == 0) {
302                     return eq[p];
303                 } else {
304                     return -1;
305             }
306             }
307             c = key[i];
308             d = c - sc[p];
309             if (d == 0) {
310                 if (c == 0) {
311                     return eq[p];
312                 }
313                 i++;
314                 p = eq[p];
315             } else if (d < 0) {
316                 p = lo[p];
317             } else {
318                 p = hi[p];
319         }
320         }
321         return -1;
322     }
323
324     public boolean knows(String JavaDoc key) {
325         return (find(key) >= 0);
326     }
327
328     // redimension the arrays
329
private void redimNodeArrays(int newsize) {
330         int len = newsize < lo.length ? newsize : lo.length;
331         char[] na = new char[newsize];
332         System.arraycopy(lo, 0, na, 0, len);
333         lo = na;
334         na = new char[newsize];
335         System.arraycopy(hi, 0, na, 0, len);
336         hi = na;
337         na = new char[newsize];
338         System.arraycopy(eq, 0, na, 0, len);
339         eq = na;
340         na = new char[newsize];
341         System.arraycopy(sc, 0, na, 0, len);
342         sc = na;
343     }
344
345     public int size() {
346         return length;
347     }
348
349     public Object JavaDoc clone() {
350         TernaryTree t = new TernaryTree();
351         t.lo = (char[])this.lo.clone();
352         t.hi = (char[])this.hi.clone();
353         t.eq = (char[])this.eq.clone();
354         t.sc = (char[])this.sc.clone();
355         t.kv = (CharVector)this.kv.clone();
356         t.root = this.root;
357         t.freenode = this.freenode;
358         t.length = this.length;
359
360         return t;
361     }
362
363     /**
364      * Recursively insert the median first and then the median of the
365      * lower and upper halves, and so on in order to get a balanced
366      * tree. The array of keys is assumed to be sorted in ascending
367      * order.
368      */

369     protected void insertBalanced(String JavaDoc[] k, char[] v, int offset, int n) {
370         int m;
371         if (n < 1) {
372             return;
373         }
374         m = n >> 1;
375
376         insert(k[m + offset], v[m + offset]);
377         insertBalanced(k, v, offset, m);
378
379         insertBalanced(k, v, offset + m + 1, n - m - 1);
380     }
381
382
383     /**
384      * Balance the tree for best search performance
385      */

386     public void balance() {
387         // System.out.print("Before root splitchar = "); System.out.println(sc[root]);
388

389         int i = 0, n = length;
390         String JavaDoc[] k = new String JavaDoc[n];
391         char[] v = new char[n];
392         Iterator iter = new Iterator();
393         while (iter.hasMoreElements()) {
394             v[i] = iter.getValue();
395             k[i++] = (String JavaDoc)iter.nextElement();
396         }
397         init();
398         insertBalanced(k, v, 0, n);
399
400         // With uniform letter distribution sc[root] should be around 'm'
401
// System.out.print("After root splitchar = "); System.out.println(sc[root]);
402
}
403
404     /**
405      * Each node stores a character (splitchar) which is part of
406      * some key(s). In a compressed branch (one that only contain
407      * a single string key) the trailer of the key which is not
408      * already in nodes is stored externally in the kv array.
409      * As items are inserted, key substrings decrease.
410      * Some substrings may completely disappear when the whole
411      * branch is totally decompressed.
412      * The tree is traversed to find the key substrings actually
413      * used. In addition, duplicate substrings are removed using
414      * a map (implemented with a TernaryTree!).
415      *
416      */

417     public void trimToSize() {
418         // first balance the tree for best performance
419
balance();
420
421         // redimension the node arrays
422
redimNodeArrays(freenode);
423
424         // ok, compact kv array
425
CharVector kx = new CharVector();
426         kx.alloc(1);
427         TernaryTree map = new TernaryTree();
428         compact(kx, map, root);
429         kv = kx;
430         kv.trimToSize();
431     }
432
433     private void compact(CharVector kx, TernaryTree map, char p) {
434         int k;
435         if (p == 0) {
436             return;
437         }
438         if (sc[p] == 0xFFFF) {
439             k = map.find(kv.getArray(), lo[p]);
440             if (k < 0) {
441                 k = kx.alloc(strlen(kv.getArray(), lo[p]) + 1);
442                 strcpy(kx.getArray(), k, kv.getArray(), lo[p]);
443                 map.insert(kx.getArray(), k, (char)k);
444             }
445             lo[p] = (char)k;
446         } else {
447             compact(kx, map, lo[p]);
448             if (sc[p] != 0) {
449                 compact(kx, map, eq[p]);
450             }
451             compact(kx, map, hi[p]);
452         }
453     }
454
455
456     public Enumeration JavaDoc keys() {
457         return new Iterator();
458     }
459
460     public class Iterator implements Enumeration JavaDoc {
461
462         /**
463          * current node index
464          */

465         int cur;
466
467         /**
468          * current key
469          */

470         String JavaDoc curkey;
471
472         private class Item implements Cloneable JavaDoc {
473             char parent;
474             char child;
475
476             public Item() {
477                 parent = 0;
478                 child = 0;
479             }
480
481             public Item(char p, char c) {
482                 parent = p;
483                 child = c;
484             }
485
486             public Object JavaDoc clone() {
487                 return new Item(parent, child);
488             }
489
490         }
491
492         /**
493          * Node stack
494          */

495         Stack JavaDoc ns;
496
497         /**
498          * key stack implemented with a StringBuffer
499          */

500         StringBuffer JavaDoc ks;
501
502         public Iterator() {
503             cur = -1;
504             ns = new Stack JavaDoc();
505             ks = new StringBuffer JavaDoc();
506             rewind();
507         }
508
509         public void rewind() {
510             ns.removeAllElements();
511             ks.setLength(0);
512             cur = root;
513             run();
514         }
515
516         public Object JavaDoc nextElement() {
517             String JavaDoc res = curkey;
518             cur = up();
519             run();
520             return res;
521         }
522
523         public char getValue() {
524             if (cur >= 0) {
525                 return eq[cur];
526             }
527             return 0;
528         }
529
530         public boolean hasMoreElements() {
531             return (cur != -1);
532         }
533
534         /**
535          * traverse upwards
536          */

537         private int up() {
538             Item i = new Item();
539             int res = 0;
540
541             if (ns.empty()) {
542                 return -1;
543             }
544
545             if (cur != 0 && sc[cur] == 0) {
546                 return lo[cur];
547             }
548
549             boolean climb = true;
550
551             while (climb) {
552                 i = (Item)ns.pop();
553                 i.child++;
554                 switch (i.child) {
555                 case 1:
556                     if (sc[i.parent] != 0) {
557                         res = eq[i.parent];
558                         ns.push(i.clone());
559                         ks.append(sc[i.parent]);
560                     } else {
561                         i.child++;
562                         ns.push(i.clone());
563                         res = hi[i.parent];
564                     }
565                     climb = false;
566                     break;
567
568                 case 2:
569                     res = hi[i.parent];
570                     ns.push(i.clone());
571                     if (ks.length() > 0) {
572                         ks.setLength(ks.length() - 1); // pop
573
}
574                     climb = false;
575                     break;
576
577                 default:
578                     if (ns.empty()) {
579                         return -1;
580                     }
581                     climb = true;
582                     break;
583                 }
584             }
585             return res;
586         }
587
588         /**
589          * traverse the tree to find next key
590          */

591         private int run() {
592             if (cur == -1) {
593                 return -1;
594             }
595
596             boolean leaf = false;
597             while (true) {
598                 // first go down on low branch until leaf or compressed branch
599
while (cur != 0) {
600                     if (sc[cur] == 0xFFFF) {
601                         leaf = true;
602                         break;
603                     }
604                     ns.push(new Item((char)cur, '\u0000'));
605                     if (sc[cur] == 0) {
606                         leaf = true;
607                         break;
608                     }
609                     cur = lo[cur];
610                 }
611                 if (leaf) {
612                     break;
613                 }
614                     // nothing found, go up one node and try again
615
cur = up();
616                 if (cur == -1) {
617                     return -1;
618                 }
619             }
620             // The current node should be a data node and
621
// the key should be in the key stack (at least partially)
622
StringBuffer JavaDoc buf = new StringBuffer JavaDoc(ks.toString());
623             if (sc[cur] == 0xFFFF) {
624                 int p = lo[cur];
625                 while (kv.get(p) != 0) {
626                     buf.append(kv.get(p++));
627             }
628             }
629             curkey = buf.toString();
630             return 0;
631         }
632
633     }
634
635     public void printStats() {
636         System.out.println("Number of keys = " + Integer.toString(length));
637         System.out.println("Node count = " + Integer.toString(freenode));
638         // System.out.println("Array length = " + Integer.toString(eq.length));
639
System.out.println("Key Array length = "
640                            + Integer.toString(kv.length()));
641
642         /*
643          * for(int i=0; i<kv.length(); i++)
644          * if ( kv.get(i) != 0 )
645          * System.out.print(kv.get(i));
646          * else
647          * System.out.println("");
648          * System.out.println("Keys:");
649          * for(Enumeration enum = keys(); enum.hasMoreElements(); )
650          * System.out.println(enum.nextElement());
651          */

652
653     }
654
655 /* public static void main(String[] args) throws Exception {
656         TernaryTree tt = new TernaryTree();
657         tt.insert("Carlos", 'C');
658         tt.insert("Car", 'r');
659         tt.insert("palos", 'l');
660         tt.insert("pa", 'p');
661         tt.trimToSize();
662         System.out.println((char)tt.find("Car"));
663         System.out.println((char)tt.find("Carlos"));
664         System.out.println((char)tt.find("alto"));
665         tt.printStats();
666     }*/

667
668 }
669
670
Popular Tags