KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > apache > fop > hyphenation > TernaryTree


1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one or more
3  * contributor license agreements. See the NOTICE file distributed with
4  * this work for additional information regarding copyright ownership.
5  * The ASF licenses this file to You under the Apache License, Version 2.0
6  * (the "License"); you may not use this file except in compliance with
7  * the License. You may obtain a copy of the License at
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  */

17
18 /* $Id: TernaryTree.java 426576 2006-07-28 15:44:37Z jeremias $ */
19
20 package org.apache.fop.hyphenation;
21
22 import java.util.Enumeration JavaDoc;
23 import java.util.Stack JavaDoc;
24 import java.io.Serializable JavaDoc;
25
26 /**
27  * <h2>Ternary Search Tree.</h2>
28  *
29  * <p>A ternary search tree is a hibrid between a binary tree and
30  * a digital search tree (trie). Keys are limited to strings.
31  * A data value of type char is stored in each leaf node.
32  * It can be used as an index (or pointer) to the data.
33  * Branches that only contain one key are compressed to one node
34  * by storing a pointer to the trailer substring of the key.
35  * This class is intended to serve as base class or helper class
36  * to implement Dictionary collections or the like. Ternary trees
37  * have some nice properties as the following: the tree can be
38  * traversed in sorted order, partial matches (wildcard) can be
39  * implemented, retrieval of all keys within a given distance
40  * from the target, etc. The storage requirements are higher than
41  * a binary tree but a lot less than a trie. Performance is
42  * comparable with a hash table, sometimes it outperforms a hash
43  * function (most of the time can determine a miss faster than a hash).</p>
44  *
45  * <p>The main purpose of this java port is to serve as a base for
46  * implementing TeX's hyphenation algorithm (see The TeXBook,
47  * appendix H). Each language requires from 5000 to 15000 hyphenation
48  * patterns which will be keys in this tree. The strings patterns
49  * are usually small (from 2 to 5 characters), but each char in the
50  * tree is stored in a node. Thus memory usage is the main concern.
51  * We will sacrify 'elegance' to keep memory requirenments to the
52  * minimum. Using java's char type as pointer (yes, I know pointer
53  * it is a forbidden word in java) we can keep the size of the node
54  * to be just 8 bytes (3 pointers and the data char). This gives
55  * room for about 65000 nodes. In my tests the english patterns
56  * took 7694 nodes and the german patterns 10055 nodes,
57  * so I think we are safe.</p>
58  *
59  * <p>All said, this is a map with strings as keys and char as value.
60  * Pretty limited!. It can be extended to a general map by
61  * using the string representation of an object and using the
62  * char value as an index to an array that contains the object
63  * values.</p>
64  *
65  * @author cav@uniscope.co.jp
66  */

67
68 public class TernaryTree implements Cloneable JavaDoc, Serializable JavaDoc {
69
70     /**
71      * We use 4 arrays to represent a node. I guess I should have created
72      * a proper node class, but somehow Knuth's pascal code made me forget
73      * we now have a portable language with virtual memory management and
74      * automatic garbage collection! And now is kind of late, furthermore,
75      * if it ain't broken, don't fix it.
76      */

77
78     /**
79      * Pointer to low branch and to rest of the key when it is
80      * stored directly in this node, we don't have unions in java!
81      */

82     protected char[] lo;
83
84     /**
85      * Pointer to high branch.
86      */

87     protected char[] hi;
88
89     /**
90      * Pointer to equal branch and to data when this node is a string terminator.
91      */

92     protected char[] eq;
93
94     /**
95      * <P>The character stored in this node: splitchar.
96      * Two special values are reserved:</P>
97      * <ul><li>0x0000 as string terminator</li>
98      * <li>0xFFFF to indicate that the branch starting at
99      * this node is compressed</li></ul>
100      * <p>This shouldn't be a problem if we give the usual semantics to
101      * strings since 0xFFFF is garanteed not to be an Unicode character.</p>
102      */

103     protected char[] sc;
104
105     /**
106      * This vector holds the trailing of the keys when the branch is compressed.
107      */

108     protected CharVector kv;
109
110     protected char root;
111     protected char freenode;
112     protected int length; // number of items in tree
113

114     protected static final int BLOCK_SIZE = 2048; // allocation size for arrays
115

116     TernaryTree() {
117         init();
118     }
119
120     protected void init() {
121         root = 0;
122         freenode = 1;
123         length = 0;
124         lo = new char[BLOCK_SIZE];
125         hi = new char[BLOCK_SIZE];
126         eq = new char[BLOCK_SIZE];
127         sc = new char[BLOCK_SIZE];
128         kv = new CharVector();
129     }
130
131     /**
132      * Branches are initially compressed, needing
133      * one node per key plus the size of the string
134      * key. They are decompressed as needed when
135      * another key with same prefix
136      * is inserted. This saves a lot of space,
137      * specially for long keys.
138      */

139     public void insert(String JavaDoc key, char val) {
140         // make sure we have enough room in the arrays
141
int len = key.length()
142                   + 1; // maximum number of nodes that may be generated
143
if (freenode + len > eq.length) {
144             redimNodeArrays(eq.length + BLOCK_SIZE);
145         }
146         char strkey[] = new char[len--];
147         key.getChars(0, len, strkey, 0);
148         strkey[len] = 0;
149         root = insert(root, strkey, 0, val);
150     }
151
152     public void insert(char[] key, int start, char val) {
153         int len = strlen(key) + 1;
154         if (freenode + len > eq.length) {
155             redimNodeArrays(eq.length + BLOCK_SIZE);
156         }
157         root = insert(root, key, start, val);
158     }
159
160     /**
161      * The actual insertion function, recursive version.
162      */

163     private char insert(char p, char[] key, int start, char val) {
164         int len = strlen(key, start);
165         if (p == 0) {
166             // this means there is no branch, this node will start a new branch.
167
// Instead of doing that, we store the key somewhere else and create
168
// only one node with a pointer to the key
169
p = freenode++;
170             eq[p] = val; // holds data
171
length++;
172             hi[p] = 0;
173             if (len > 0) {
174                 sc[p] = 0xFFFF; // indicates branch is compressed
175
lo[p] = (char)kv.alloc(len
176                                        + 1); // use 'lo' to hold pointer to key
177
strcpy(kv.getArray(), lo[p], key, start);
178             } else {
179                 sc[p] = 0;
180                 lo[p] = 0;
181             }
182             return p;
183         }
184
185         if (sc[p] == 0xFFFF) {
186             // branch is compressed: need to decompress
187
// this will generate garbage in the external key array
188
// but we can do some garbage collection later
189
char pp = freenode++;
190             lo[pp] = lo[p]; // previous pointer to key
191
eq[pp] = eq[p]; // previous pointer to data
192
lo[p] = 0;
193             if (len > 0) {
194                 sc[p] = kv.get(lo[pp]);
195                 eq[p] = pp;
196                 lo[pp]++;
197                 if (kv.get(lo[pp]) == 0) {
198                     // key completly decompressed leaving garbage in key array
199
lo[pp] = 0;
200                     sc[pp] = 0;
201                     hi[pp] = 0;
202                 } else {
203                     // we only got first char of key, rest is still there
204
sc[pp] = 0xFFFF;
205                 }
206             } else {
207                 // In this case we can save a node by swapping the new node
208
// with the compressed node
209
sc[pp] = 0xFFFF;
210                 hi[p] = pp;
211                 sc[p] = 0;
212                 eq[p] = val;
213                 length++;
214                 return p;
215             }
216         }
217         char s = key[start];
218         if (s < sc[p]) {
219             lo[p] = insert(lo[p], key, start, val);
220         } else if (s == sc[p]) {
221             if (s != 0) {
222                 eq[p] = insert(eq[p], key, start + 1, val);
223             } else {
224                 // key already in tree, overwrite data
225
eq[p] = val;
226             }
227         } else {
228             hi[p] = insert(hi[p], key, start, val);
229         }
230         return p;
231     }
232
233     /**
234      * Compares 2 null terminated char arrays
235      */

236     public static int strcmp(char[] a, int startA, char[] b, int startB) {
237         for (; a[startA] == b[startB]; startA++, startB++) {
238             if (a[startA] == 0) {
239                 return 0;
240             }
241         }
242         return a[startA] - b[startB];
243     }
244
245     /**
246      * Compares a string with null terminated char array
247      */

248     public static int strcmp(String JavaDoc str, char[] a, int start) {
249         int i, d, len = str.length();
250         for (i = 0; i < len; i++) {
251             d = (int)str.charAt(i) - a[start + i];
252             if (d != 0) {
253                 return d;
254             }
255             if (a[start + i] == 0) {
256                 return d;
257             }
258         }
259         if (a[start + i] != 0) {
260             return (int)-a[start + i];
261         }
262         return 0;
263
264     }
265
266     public static void strcpy(char[] dst, int di, char[] src, int si) {
267         while (src[si] != 0) {
268             dst[di++] = src[si++];
269         }
270         dst[di] = 0;
271     }
272
273     public static int strlen(char[] a, int start) {
274         int len = 0;
275         for (int i = start; i < a.length && a[i] != 0; i++) {
276             len++;
277         }
278         return len;
279     }
280
281     public static int strlen(char[] a) {
282         return strlen(a, 0);
283     }
284
285     public int find(String JavaDoc key) {
286         int len = key.length();
287         char strkey[] = new char[len + 1];
288         key.getChars(0, len, strkey, 0);
289         strkey[len] = 0;
290
291         return find(strkey, 0);
292     }
293
294     public int find(char[] key, int start) {
295         int d;
296         char p = root;
297         int i = start;
298         char c;
299
300         while (p != 0) {
301             if (sc[p] == 0xFFFF) {
302                 if (strcmp(key, i, kv.getArray(), lo[p]) == 0) {
303                     return eq[p];
304                 } else {
305                     return -1;
306                 }
307             }
308             c = key[i];
309             d = c - sc[p];
310             if (d == 0) {
311                 if (c == 0) {
312                     return eq[p];
313                 }
314                 i++;
315                 p = eq[p];
316             } else if (d < 0) {
317                 p = lo[p];
318             } else {
319                 p = hi[p];
320             }
321         }
322         return -1;
323     }
324
325     public boolean knows(String JavaDoc key) {
326         return (find(key) >= 0);
327     }
328
329     // redimension the arrays
330
private void redimNodeArrays(int newsize) {
331         int len = newsize < lo.length ? newsize : lo.length;
332         char[] na = new char[newsize];
333         System.arraycopy(lo, 0, na, 0, len);
334         lo = na;
335         na = new char[newsize];
336         System.arraycopy(hi, 0, na, 0, len);
337         hi = na;
338         na = new char[newsize];
339         System.arraycopy(eq, 0, na, 0, len);
340         eq = na;
341         na = new char[newsize];
342         System.arraycopy(sc, 0, na, 0, len);
343         sc = na;
344     }
345
346     public int size() {
347         return length;
348     }
349
350     public Object JavaDoc clone() {
351         TernaryTree t = new TernaryTree();
352         t.lo = (char[])this.lo.clone();
353         t.hi = (char[])this.hi.clone();
354         t.eq = (char[])this.eq.clone();
355         t.sc = (char[])this.sc.clone();
356         t.kv = (CharVector)this.kv.clone();
357         t.root = this.root;
358         t.freenode = this.freenode;
359         t.length = this.length;
360
361         return t;
362     }
363
364     /**
365      * Recursively insert the median first and then the median of the
366      * lower and upper halves, and so on in order to get a balanced
367      * tree. The array of keys is assumed to be sorted in ascending
368      * order.
369      */

370     protected void insertBalanced(String JavaDoc[] k, char[] v, int offset, int n) {
371         int m;
372         if (n < 1) {
373             return;
374         }
375         m = n >> 1;
376
377         insert(k[m + offset], v[m + offset]);
378         insertBalanced(k, v, offset, m);
379
380         insertBalanced(k, v, offset + m + 1, n - m - 1);
381     }
382
383
384     /**
385      * Balance the tree for best search performance
386      */

387     public void balance() {
388         // System.out.print("Before root splitchar = "); System.out.println(sc[root]);
389

390         int i = 0, n = length;
391         String JavaDoc[] k = new String JavaDoc[n];
392         char[] v = new char[n];
393         Iterator iter = new Iterator();
394         while (iter.hasMoreElements()) {
395             v[i] = iter.getValue();
396             k[i++] = (String JavaDoc)iter.nextElement();
397         }
398         init();
399         insertBalanced(k, v, 0, n);
400
401         // With uniform letter distribution sc[root] should be around 'm'
402
// System.out.print("After root splitchar = "); System.out.println(sc[root]);
403
}
404
405     /**
406      * Each node stores a character (splitchar) which is part of
407      * some key(s). In a compressed branch (one that only contain
408      * a single string key) the trailer of the key which is not
409      * already in nodes is stored externally in the kv array.
410      * As items are inserted, key substrings decrease.
411      * Some substrings may completely disappear when the whole
412      * branch is totally decompressed.
413      * The tree is traversed to find the key substrings actually
414      * used. In addition, duplicate substrings are removed using
415      * a map (implemented with a TernaryTree!).
416      *
417      */

418     public void trimToSize() {
419         // first balance the tree for best performance
420
balance();
421
422         // redimension the node arrays
423
redimNodeArrays(freenode);
424
425         // ok, compact kv array
426
CharVector kx = new CharVector();
427         kx.alloc(1);
428         TernaryTree map = new TernaryTree();
429         compact(kx, map, root);
430         kv = kx;
431         kv.trimToSize();
432     }
433
434     private void compact(CharVector kx, TernaryTree map, char p) {
435         int k;
436         if (p == 0) {
437             return;
438         }
439         if (sc[p] == 0xFFFF) {
440             k = map.find(kv.getArray(), lo[p]);
441             if (k < 0) {
442                 k = kx.alloc(strlen(kv.getArray(), lo[p]) + 1);
443                 strcpy(kx.getArray(), k, kv.getArray(), lo[p]);
444                 map.insert(kx.getArray(), k, (char)k);
445             }
446             lo[p] = (char)k;
447         } else {
448             compact(kx, map, lo[p]);
449             if (sc[p] != 0) {
450                 compact(kx, map, eq[p]);
451             }
452             compact(kx, map, hi[p]);
453         }
454     }
455
456
457     public Enumeration JavaDoc keys() {
458         return new Iterator();
459     }
460
461     public class Iterator implements Enumeration JavaDoc {
462
463         /**
464          * current node index
465          */

466         int cur;
467
468         /**
469          * current key
470          */

471         String JavaDoc curkey;
472
473         private class Item implements Cloneable JavaDoc {
474             char parent;
475             char child;
476
477             public Item() {
478                 parent = 0;
479                 child = 0;
480             }
481
482             public Item(char p, char c) {
483                 parent = p;
484                 child = c;
485             }
486
487             public Object JavaDoc clone() {
488                 return new Item(parent, child);
489             }
490
491         }
492
493         /**
494          * Node stack
495          */

496         Stack JavaDoc ns;
497
498         /**
499          * key stack implemented with a StringBuffer
500          */

501         StringBuffer JavaDoc ks;
502
503         public Iterator() {
504             cur = -1;
505             ns = new Stack JavaDoc();
506             ks = new StringBuffer JavaDoc();
507             rewind();
508         }
509
510         public void rewind() {
511             ns.removeAllElements();
512             ks.setLength(0);
513             cur = root;
514             run();
515         }
516
517         public Object JavaDoc nextElement() {
518             String JavaDoc res = new String JavaDoc(curkey);
519             cur = up();
520             run();
521             return res;
522         }
523
524         public char getValue() {
525             if (cur >= 0) {
526                 return eq[cur];
527             }
528             return 0;
529         }
530
531         public boolean hasMoreElements() {
532             return (cur != -1);
533         }
534
535         /**
536          * traverse upwards
537          */

538         private int up() {
539             Item i = new Item();
540             int res = 0;
541
542             if (ns.empty()) {
543                 return -1;
544             }
545
546             if (cur != 0 && sc[cur] == 0) {
547                 return lo[cur];
548             }
549
550             boolean climb = true;
551
552             while (climb) {
553                 i = (Item)ns.pop();
554                 i.child++;
555                 switch (i.child) {
556                 case 1:
557                     if (sc[i.parent] != 0) {
558                         res = eq[i.parent];
559                         ns.push(i.clone());
560                         ks.append(sc[i.parent]);
561                     } else {
562                         i.child++;
563                         ns.push(i.clone());
564                         res = hi[i.parent];
565                     }
566                     climb = false;
567                     break;
568
569                 case 2:
570                     res = hi[i.parent];
571                     ns.push(i.clone());
572                     if (ks.length() > 0) {
573                         ks.setLength(ks.length() - 1); // pop
574
}
575                     climb = false;
576                     break;
577
578                 default:
579                     if (ns.empty()) {
580                         return -1;
581                     }
582                     climb = true;
583                     break;
584                 }
585             }
586             return res;
587         }
588
589         /**
590          * traverse the tree to find next key
591          */

592         private int run() {
593             if (cur == -1) {
594                 return -1;
595             }
596
597             boolean leaf = false;
598             while (true) {
599                 // first go down on low branch until leaf or compressed branch
600
while (cur != 0) {
601                     if (sc[cur] == 0xFFFF) {
602                         leaf = true;
603                         break;
604                     }
605                     ns.push(new Item((char)cur, '\u0000'));
606                     if (sc[cur] == 0) {
607                         leaf = true;
608                         break;
609                     }
610                     cur = lo[cur];
611                 }
612                 if (leaf) {
613                     break;
614                 }
615                 // nothing found, go up one node and try again
616
cur = up();
617                 if (cur == -1) {
618                     return -1;
619                 }
620             }
621             // The current node should be a data node and
622
// the key should be in the key stack (at least partially)
623
StringBuffer JavaDoc buf = new StringBuffer JavaDoc(ks.toString());
624             if (sc[cur] == 0xFFFF) {
625                 int p = lo[cur];
626                 while (kv.get(p) != 0) {
627                     buf.append(kv.get(p++));
628                 }
629             }
630             curkey = buf.toString();
631             return 0;
632         }
633
634     }
635
636     public void printStats() {
637         System.out.println("Number of keys = " + Integer.toString(length));
638         System.out.println("Node count = " + Integer.toString(freenode));
639         // System.out.println("Array length = " + Integer.toString(eq.length));
640
System.out.println("Key Array length = "
641                            + Integer.toString(kv.length()));
642
643         /*
644          * for(int i=0; i<kv.length(); i++)
645          * if ( kv.get(i) != 0 )
646          * System.out.print(kv.get(i));
647          * else
648          * System.out.println("");
649          * System.out.println("Keys:");
650          * for(Enumeration enum = keys(); enum.hasMoreElements(); )
651          * System.out.println(enum.nextElement());
652          */

653
654     }
655
656     public static void main(String JavaDoc[] args) throws Exception JavaDoc {
657         TernaryTree tt = new TernaryTree();
658         tt.insert("Carlos", 'C');
659         tt.insert("Car", 'r');
660         tt.insert("palos", 'l');
661         tt.insert("pa", 'p');
662         tt.trimToSize();
663         System.out.println((char)tt.find("Car"));
664         System.out.println((char)tt.find("Carlos"));
665         System.out.println((char)tt.find("alto"));
666         tt.printStats();
667     }
668
669 }
670
671
Popular Tags