KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > apache > fop > layout > hyphenation > TernaryTree


1 /*
2  * $Id: TernaryTree.java,v 1.5.2.1 2003/02/25 14:07:11 jeremias Exp $
3  * ============================================================================
4  * The Apache Software License, Version 1.1
5  * ============================================================================
6  *
7  * Copyright (C) 1999-2003 The Apache Software Foundation. All rights reserved.
8  *
9  * Redistribution and use in source and binary forms, with or without modifica-
10  * tion, are permitted provided that the following conditions are met:
11  *
12  * 1. Redistributions of source code must retain the above copyright notice,
13  * this list of conditions and the following disclaimer.
14  *
15  * 2. Redistributions in binary form must reproduce the above copyright notice,
16  * this list of conditions and the following disclaimer in the documentation
17  * and/or other materials provided with the distribution.
18  *
19  * 3. The end-user documentation included with the redistribution, if any, must
20  * include the following acknowledgment: "This product includes software
21  * developed by the Apache Software Foundation (http://www.apache.org/)."
22  * Alternately, this acknowledgment may appear in the software itself, if
23  * and wherever such third-party acknowledgments normally appear.
24  *
25  * 4. The names "FOP" and "Apache Software Foundation" must not be used to
26  * endorse or promote products derived from this software without prior
27  * written permission. For written permission, please contact
28  * apache@apache.org.
29  *
30  * 5. Products derived from this software may not be called "Apache", nor may
31  * "Apache" appear in their name, without prior written permission of the
32  * Apache Software Foundation.
33  *
34  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED WARRANTIES,
35  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
36  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
37  * APACHE SOFTWARE FOUNDATION OR ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
38  * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLU-
39  * DING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
40  * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
41  * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
42  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
43  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
44  * ============================================================================
45  *
46  * This software consists of voluntary contributions made by many individuals
47  * on behalf of the Apache Software Foundation and was originally created by
48  * James Tauber <jtauber@jtauber.com>. For more information on the Apache
49  * Software Foundation, please see <http://www.apache.org/>.
50  */

51 package org.apache.fop.layout.hyphenation;
52
53 import java.util.Enumeration JavaDoc;
54 import java.util.Stack JavaDoc;
55 import java.io.Serializable JavaDoc;
56
57 /**
58  * <h2>Ternary Search Tree</h2>
59  *
60  * <p>A ternary search tree is a hibrid between a binary tree and
61  * a digital search tree (trie). Keys are limited to strings.
62  * A data value of type char is stored in each leaf node.
63  * It can be used as an index (or pointer) to the data.
64  * Branches that only contain one key are compressed to one node
65  * by storing a pointer to the trailer substring of the key.
66  * This class is intended to serve as base class or helper class
67  * to implement Dictionary collections or the like. Ternary trees
68  * have some nice properties as the following: the tree can be
69  * traversed in sorted order, partial matches (wildcard) can be
70  * implemented, retrieval of all keys within a given distance
71  * from the target, etc. The storage requirements are higher than
72  * a binary tree but a lot less than a trie. Performance is
73  * comparable with a hash table, sometimes it outperforms a hash
74  * function (most of the time can determine a miss faster than a hash).</p>
75  *
76  * <p>The main purpose of this java port is to serve as a base for
77  * implementing TeX's hyphenation algorithm (see The TeXBook,
78  * appendix H). Each language requires from 5000 to 15000 hyphenation
79  * patterns which will be keys in this tree. The strings patterns
80  * are usually small (from 2 to 5 characters), but each char in the
81  * tree is stored in a node. Thus memory usage is the main concern.
82  * We will sacrify 'elegance' to keep memory requirenments to the
83  * minimum. Using java's char type as pointer (yes, I know pointer
84  * it is a forbidden word in java) we can keep the size of the node
85  * to be just 8 bytes (3 pointers and the data char). This gives
86  * room for about 65000 nodes. In my tests the english patterns
87  * took 7694 nodes and the german patterns 10055 nodes,
88  * so I think we are safe.</p>
89  *
90  * <p>All said, this is a map with strings as keys and char as value.
91  * Pretty limited!. It can be extended to a general map by
92  * using the string representation of an object and using the
93  * char value as an index to an array that contains the object
94  * values.</p>
95  *
96  * @author cav@uniscope.co.jp
97  */

98
99 public class TernaryTree implements Cloneable JavaDoc, Serializable JavaDoc {
100
101     /**
102      * We use 4 arrays to represent a node. I guess I should have created
103      * a proper node class, but somehow Knuth's pascal code made me forget
104      * we now have a portable language with virtual memory management and
105      * automatic garbage collection! And now is kind of late, furthermore,
106      * if it ain't broken, don't fix it.
107      */

108
109     /**
110      * Pointer to low branch and to rest of the key when it is
111      * stored directly in this node, we don't have unions in java!
112      */

113     protected char[] lo;
114
115     /**
116      * Pointer to high branch.
117      */

118     protected char[] hi;
119
120     /**
121      * Pointer to equal branch and to data when this node is a string terminator.
122      */

123     protected char[] eq;
124
125     /**
126      * <P>The character stored in this node: splitchar
127      * Two special values are reserved:</P>
128      * <ul><li>0x0000 as string terminator</li>
129      * <li>0xFFFF to indicate that the branch starting at
130      * this node is compressed</li></ul>
131      * <p>This shouldn't be a problem if we give the usual semantics to
132      * strings since 0xFFFF is garanteed not to be an Unicode character.</p>
133      */

134     protected char[] sc;
135
136     /**
137      * This vector holds the trailing of the keys when the branch is compressed.
138      */

139     protected CharVector kv;
140
141     protected char root;
142     protected char freenode;
143     protected int length; // number of items in tree
144

145     protected static final int BLOCK_SIZE = 2048; // allocation size for arrays
146

147     TernaryTree() {
148         init();
149     }
150
151     protected void init() {
152         root = 0;
153         freenode = 1;
154         length = 0;
155         lo = new char[BLOCK_SIZE];
156         hi = new char[BLOCK_SIZE];
157         eq = new char[BLOCK_SIZE];
158         sc = new char[BLOCK_SIZE];
159         kv = new CharVector();
160     }
161
162     /**
163      * Branches are initially compressed, needing
164      * one node per key plus the size of the string
165      * key. They are decompressed as needed when
166      * another key with same prefix
167      * is inserted. This saves a lot of space,
168      * specially for long keys.
169      */

170     public void insert(String JavaDoc key, char val) {
171         // make sure we have enough room in the arrays
172
int len = key.length()
173                   + 1; // maximum number of nodes that may be generated
174
if (freenode + len > eq.length)
175             redimNodeArrays(eq.length + BLOCK_SIZE);
176         char strkey[] = new char[len--];
177         key.getChars(0, len, strkey, 0);
178         strkey[len] = 0;
179         root = insert(root, strkey, 0, val);
180     }
181
182     public void insert(char[] key, int start, char val) {
183         int len = strlen(key) + 1;
184         if (freenode + len > eq.length)
185             redimNodeArrays(eq.length + BLOCK_SIZE);
186         root = insert(root, key, start, val);
187     }
188
189     /**
190      * The actual insertion function, recursive version.
191      */

192     private char insert(char p, char[] key, int start, char val) {
193         int len = strlen(key, start);
194         if (p == 0) {
195             // this means there is no branch, this node will start a new branch.
196
// Instead of doing that, we store the key somewhere else and create
197
// only one node with a pointer to the key
198
p = freenode++;
199             eq[p] = val; // holds data
200
length++;
201             hi[p] = 0;
202             if (len > 0) {
203                 sc[p] = 0xFFFF; // indicates branch is compressed
204
lo[p] = (char)kv.alloc(len
205                                        + 1); // use 'lo' to hold pointer to key
206
strcpy(kv.getArray(), lo[p], key, start);
207             } else {
208                 sc[p] = 0;
209                 lo[p] = 0;
210             }
211             return p;
212         }
213
214         if (sc[p] == 0xFFFF) {
215             // branch is compressed: need to decompress
216
// this will generate garbage in the external key array
217
// but we can do some garbage collection later
218
char pp = freenode++;
219             lo[pp] = lo[p]; // previous pointer to key
220
eq[pp] = eq[p]; // previous pointer to data
221
lo[p] = 0;
222             if (len > 0) {
223                 sc[p] = kv.get(lo[pp]);
224                 eq[p] = pp;
225                 lo[pp]++;
226                 if (kv.get(lo[pp]) == 0) {
227                     // key completly decompressed leaving garbage in key array
228
lo[pp] = 0;
229                     sc[pp] = 0;
230                     hi[pp] = 0;
231                 } else
232                     sc[pp] =
233                         0xFFFF; // we only got first char of key, rest is still there
234
} else {
235                 // In this case we can save a node by swapping the new node
236
// with the compressed node
237
sc[pp] = 0xFFFF;
238                 hi[p] = pp;
239                 sc[p] = 0;
240                 eq[p] = val;
241                 length++;
242                 return p;
243             }
244         }
245         char s = key[start];
246         if (s < sc[p])
247             lo[p] = insert(lo[p], key, start, val);
248         else if (s == sc[p]) {
249             if (s != 0)
250                 eq[p] = insert(eq[p], key, start + 1, val);
251             else {
252                 // key already in tree, overwrite data
253
eq[p] = val;
254             }
255
256         } else
257             hi[p] = insert(hi[p], key, start, val);
258         return p;
259     }
260
261     /**
262      * Compares 2 null terminated char arrays
263      */

264     public static int strcmp(char[] a, int startA, char[] b, int startB) {
265         for (; a[startA] == b[startB]; startA++, startB++)
266             if (a[startA] == 0)
267                 return 0;
268         return a[startA] - b[startB];
269     }
270
271     /**
272      * Compares a string with null terminated char array
273      */

274     public static int strcmp(String JavaDoc str, char[] a, int start) {
275         int i, d, len = str.length();
276         for (i = 0; i < len; i++) {
277             d = (int)str.charAt(i) - a[start + i];
278             if (d != 0)
279                 return d;
280             if (a[start + i] == 0)
281                 return d;
282         }
283         if (a[start + i] != 0)
284             return (int)-a[start + i];
285         return 0;
286
287     }
288
289     public static void strcpy(char[] dst, int di, char[] src, int si) {
290         while (src[si] != 0)
291             dst[di++] = src[si++];
292         dst[di] = 0;
293     }
294
295     public static int strlen(char[] a, int start) {
296         int len = 0;
297         for (int i = start; i < a.length && a[i] != 0; i++)
298             len++;
299         return len;
300     }
301
302     public static int strlen(char[] a) {
303         return strlen(a, 0);
304     }
305
306     public int find(String JavaDoc key) {
307         int len = key.length();
308         char strkey[] = new char[len + 1];
309         key.getChars(0, len, strkey, 0);
310         strkey[len] = 0;
311
312         return find(strkey, 0);
313     }
314
315     public int find(char[] key, int start) {
316         int d;
317         char p = root;
318         int i = start;
319         char c;
320
321         while (p != 0) {
322             if (sc[p] == 0xFFFF) {
323                 if (strcmp(key, i, kv.getArray(), lo[p]) == 0)
324                     return eq[p];
325                 else
326                     return -1;
327             }
328             c = key[i];
329             d = c - sc[p];
330             if (d == 0) {
331                 if (c == 0)
332                     return eq[p];
333                 i++;
334                 p = eq[p];
335             } else if (d < 0)
336                 p = lo[p];
337             else
338                 p = hi[p];
339         }
340         return -1;
341     }
342
343     public boolean knows(String JavaDoc key) {
344         return (find(key) >= 0);
345     }
346
347     // redimension the arrays
348
private void redimNodeArrays(int newsize) {
349         int len = newsize < lo.length ? newsize : lo.length;
350         char[] na = new char[newsize];
351         System.arraycopy(lo, 0, na, 0, len);
352         lo = na;
353         na = new char[newsize];
354         System.arraycopy(hi, 0, na, 0, len);
355         hi = na;
356         na = new char[newsize];
357         System.arraycopy(eq, 0, na, 0, len);
358         eq = na;
359         na = new char[newsize];
360         System.arraycopy(sc, 0, na, 0, len);
361         sc = na;
362     }
363
364     public int size() {
365         return length;
366     }
367
368     public Object JavaDoc clone() {
369         TernaryTree t = new TernaryTree();
370         t.lo = (char[])this.lo.clone();
371         t.hi = (char[])this.hi.clone();
372         t.eq = (char[])this.eq.clone();
373         t.sc = (char[])this.sc.clone();
374         t.kv = (CharVector)this.kv.clone();
375         t.root = this.root;
376         t.freenode = this.freenode;
377         t.length = this.length;
378
379         return t;
380     }
381
382     /**
383      * Recursively insert the median first and then the median of the
384      * lower and upper halves, and so on in order to get a balanced
385      * tree. The array of keys is assumed to be sorted in ascending
386      * order.
387      */

388     protected void insertBalanced(String JavaDoc[] k, char[] v, int offset, int n) {
389         int m;
390         if (n < 1)
391             return;
392         m = n >> 1;
393
394         insert(k[m + offset], v[m + offset]);
395         insertBalanced(k, v, offset, m);
396
397         insertBalanced(k, v, offset + m + 1, n - m - 1);
398     }
399
400
401     /**
402      * Balance the tree for best search performance
403      */

404     public void balance() {
405         // System.out.print("Before root splitchar = "); System.out.println(sc[root]);
406

407         int i = 0, n = length;
408         String JavaDoc[] k = new String JavaDoc[n];
409         char[] v = new char[n];
410         Iterator iter = new Iterator();
411         while (iter.hasMoreElements()) {
412             v[i] = iter.getValue();
413             k[i++] = (String JavaDoc)iter.nextElement();
414         }
415         init();
416         insertBalanced(k, v, 0, n);
417
418         // With uniform letter distribution sc[root] should be around 'm'
419
// System.out.print("After root splitchar = "); System.out.println(sc[root]);
420
}
421
422     /**
423      * Each node stores a character (splitchar) which is part of
424      * some key(s). In a compressed branch (one that only contain
425      * a single string key) the trailer of the key which is not
426      * already in nodes is stored externally in the kv array.
427      * As items are inserted, key substrings decrease.
428      * Some substrings may completely disappear when the whole
429      * branch is totally decompressed.
430      * The tree is traversed to find the key substrings actually
431      * used. In addition, duplicate substrings are removed using
432      * a map (implemented with a TernaryTree!).
433      *
434      */

435     public void trimToSize() {
436         // first balance the tree for best performance
437
balance();
438
439         // redimension the node arrays
440
redimNodeArrays(freenode);
441
442         // ok, compact kv array
443
CharVector kx = new CharVector();
444         kx.alloc(1);
445         TernaryTree map = new TernaryTree();
446         compact(kx, map, root);
447         kv = kx;
448         kv.trimToSize();
449     }
450
451     private void compact(CharVector kx, TernaryTree map, char p) {
452         int k;
453         if (p == 0)
454             return;
455         if (sc[p] == 0xFFFF) {
456             k = map.find(kv.getArray(), lo[p]);
457             if (k < 0) {
458                 k = kx.alloc(strlen(kv.getArray(), lo[p]) + 1);
459                 strcpy(kx.getArray(), k, kv.getArray(), lo[p]);
460                 map.insert(kx.getArray(), k, (char)k);
461             }
462             lo[p] = (char)k;
463         } else {
464             compact(kx, map, lo[p]);
465             if (sc[p] != 0)
466                 compact(kx, map, eq[p]);
467             compact(kx, map, hi[p]);
468         }
469     }
470
471
472     public Enumeration JavaDoc keys() {
473         return new Iterator();
474     }
475
476     public class Iterator implements Enumeration JavaDoc {
477
478         /**
479          * current node index
480          */

481         int cur;
482
483         /**
484          * current key
485          */

486         String JavaDoc curkey;
487
488         private class Item implements Cloneable JavaDoc {
489             char parent;
490             char child;
491
492             public Item() {
493                 parent = 0;
494                 child = 0;
495             }
496
497             public Item(char p, char c) {
498                 parent = p;
499                 child = c;
500             }
501
502             public Object JavaDoc clone() {
503                 return new Item(parent, child);
504             }
505
506         }
507
508         /**
509          * Node stack
510          */

511         Stack JavaDoc ns;
512
513         /**
514          * key stack implemented with a StringBuffer
515          */

516         StringBuffer JavaDoc ks;
517
518         public Iterator() {
519             cur = -1;
520             ns = new Stack JavaDoc();
521             ks = new StringBuffer JavaDoc();
522             rewind();
523         }
524
525         public void rewind() {
526             ns.removeAllElements();
527             ks.setLength(0);
528             cur = root;
529             run();
530         }
531
532         public Object JavaDoc nextElement() {
533             String JavaDoc res = new String JavaDoc(curkey);
534             cur = up();
535             run();
536             return res;
537         }
538
539         public char getValue() {
540             if (cur >= 0)
541                 return eq[cur];
542             return 0;
543         }
544
545         public boolean hasMoreElements() {
546             return (cur != -1);
547         }
548
549         /**
550          * traverse upwards
551          */

552         private int up() {
553             Item i = new Item();
554             int res = 0;
555
556             if (ns.empty())
557                 return -1;
558
559             if (cur != 0 && sc[cur] == 0)
560                 return lo[cur];
561
562             boolean climb = true;
563
564             while (climb) {
565                 i = (Item)ns.pop();
566                 i.child++;
567                 switch (i.child) {
568                 case 1:
569                     if (sc[i.parent] != 0) {
570                         res = eq[i.parent];
571                         ns.push(i.clone());
572                         ks.append(sc[i.parent]);
573                     } else {
574                         i.child++;
575                         ns.push(i.clone());
576                         res = hi[i.parent];
577                     }
578                     climb = false;
579                     break;
580
581                 case 2:
582                     res = hi[i.parent];
583                     ns.push(i.clone());
584                     if (ks.length() > 0)
585                         ks.setLength(ks.length() - 1); // pop
586
climb = false;
587                     break;
588
589                 default:
590                     if (ns.empty())
591                         return -1;
592                     climb = true;
593                     break;
594                 }
595             }
596             return res;
597         }
598
599         /**
600          * traverse the tree to find next key
601          */

602         private int run() {
603             if (cur == -1)
604                 return -1;
605
606             boolean leaf = false;
607             for (; ; ) {
608                 // first go down on low branch until leaf or compressed branch
609
while (cur != 0) {
610                     if (sc[cur] == 0xFFFF) {
611                         leaf = true;
612                         break;
613                     }
614                     ns.push(new Item((char)cur, '\u0000'));
615                     if (sc[cur] == 0) {
616                         leaf = true;
617                         break;
618                     }
619                     cur = lo[cur];
620                 }
621                 if (leaf)
622                     break;
623                     // nothing found, go up one node and try again
624
cur = up();
625                 if (cur == -1) {
626                     return -1;
627                 }
628             }
629             // The current node should be a data node and
630
// the key should be in the key stack (at least partially)
631
StringBuffer JavaDoc buf = new StringBuffer JavaDoc(ks.toString());
632             if (sc[cur] == 0xFFFF) {
633                 int p = lo[cur];
634                 while (kv.get(p) != 0)
635                     buf.append(kv.get(p++));
636             }
637             curkey = buf.toString();
638             return 0;
639         }
640
641     }
642
643     public void printStats() {
644         System.out.println("Number of keys = " + Integer.toString(length));
645         System.out.println("Node count = " + Integer.toString(freenode));
646         // System.out.println("Array length = " + Integer.toString(eq.length));
647
System.out.println("Key Array length = "
648                            + Integer.toString(kv.length()));
649
650         /*
651          * for(int i=0; i<kv.length(); i++)
652          * if ( kv.get(i) != 0 )
653          * System.out.print(kv.get(i));
654          * else
655          * System.out.println("");
656          * System.out.println("Keys:");
657          * for(Enumeration enum = keys(); enum.hasMoreElements(); )
658          * System.out.println(enum.nextElement());
659          */

660
661     }
662
663     public static void main(String JavaDoc[] args) throws Exception JavaDoc {
664         TernaryTree tt = new TernaryTree();
665         tt.insert("Carlos", 'C');
666         tt.insert("Car", 'r');
667         tt.insert("palos", 'l');
668         tt.insert("pa", 'p');
669         tt.trimToSize();
670         System.out.println((char)tt.find("Car"));
671         System.out.println((char)tt.find("Carlos"));
672         System.out.println((char)tt.find("alto"));
673         tt.printStats();
674     }
675
676 }
677
678
Popular Tags