KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > net > nutch > util > TrieStringMatcher


1 /* Copyright (c) 2003 The Nutch Organization. All rights reserved. */
2 /* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
3
4 package net.nutch.util;
5
6
7 import java.util.Arrays JavaDoc;
8 import java.util.LinkedList JavaDoc;
9 import java.util.ListIterator JavaDoc;
10
11 /**
12  * TrieStringMatcher is a base class for simple tree-based string
13  * matching.
14  *
15  */

16 public abstract class TrieStringMatcher {
17   protected TrieNode root;
18
19   protected TrieStringMatcher() {
20     this.root= new TrieNode('\000', false);
21   }
22
23   /**
24    * Node class for the character tree.
25    */

26   protected class TrieNode implements Comparable JavaDoc {
27     protected TrieNode[] children;
28     protected LinkedList JavaDoc childrenList;
29     protected char nodeChar;
30     protected boolean terminal;
31
32     /**
33      * Creates a new TrieNode, which contains the given
34      * <code>nodeChar</code>. If <code>isTerminal</code> is
35      * <code>true</code>, the new node is a <em>terminal</em> node in
36      * the trie.
37      */

38     TrieNode(char nodeChar, boolean isTerminal) {
39       this.nodeChar= nodeChar;
40       this.terminal= isTerminal;
41       this.childrenList= new LinkedList JavaDoc();
42     }
43
44     /**
45      * Returns <code>true</code> if this node is a <em>terminal</em>
46      * node in the trie.
47      */

48     boolean isTerminal() {
49       return terminal;
50     }
51
52     /**
53      * Returns the child node of this node whose node-character is
54      * <code>nextChar</code>. If no such node exists, one will be is
55      * added. If <em>isTerminal</em> is <code>true</code>, the node
56      * will be a terminal node in the trie.
57      */

58     TrieNode getChildAddIfNotPresent(char nextChar, boolean isTerminal) {
59       if (childrenList == null) {
60         childrenList= new LinkedList JavaDoc();
61         childrenList.addAll(Arrays.asList(children));
62         children= null;
63       }
64
65       if (childrenList.size() == 0) {
66         TrieNode newNode= new TrieNode(nextChar, isTerminal);
67         childrenList.add(newNode);
68         return newNode;
69       }
70
71       ListIterator JavaDoc iter= childrenList.listIterator();
72       TrieNode node= (TrieNode) iter.next();
73       while ( (node.nodeChar < nextChar) && iter.hasNext() )
74         node= (TrieNode) iter.next();
75                         
76       if (node.nodeChar == nextChar) {
77         node.terminal= node.terminal | isTerminal;
78         return node;
79       }
80
81       if (node.nodeChar > nextChar)
82         iter.previous();
83
84       TrieNode newNode= new TrieNode(nextChar, isTerminal);
85       iter.add(newNode);
86       return newNode;
87     }
88
89     /**
90      * Returns the child node of this node whose node-character is
91      * <code>nextChar</code>. If no such node exists,
92      * <code>null</code> is returned.
93      */

94     TrieNode getChild(char nextChar) {
95       if (children == null) {
96         children= (TrieNode[])
97           childrenList.toArray(new TrieNode[childrenList.size()]);
98         childrenList= null;
99         Arrays.sort(children);
100       }
101
102       int min= 0;
103       int max= children.length - 1;
104       int mid= 0;
105       while (min < max) {
106         mid= (min + max) / 2;
107         if (children[mid].nodeChar == nextChar)
108           return children[mid];
109         if (children[mid].nodeChar < nextChar)
110           min= mid + 1;
111         else // if (children[mid].nodeChar > nextChar)
112
max= mid - 1;
113       }
114
115       if (min == max)
116         if (children[min].nodeChar == nextChar)
117           return children[min];
118
119       return null;
120     }
121
122     public int compareTo(Object JavaDoc o) {
123       TrieNode other= (TrieNode) o;
124       if (this.nodeChar < other.nodeChar)
125         return -1;
126       if (this.nodeChar == other.nodeChar)
127         return 0;
128 // if (this.nodeChar > other.nodeChar)
129
return 1;
130     }
131   }
132
133   /**
134    * Returns the next {@link TrieNode} visited, given that you are at
135    * <code>node</code>, and the the next character in the input is
136    * the <code>idx</code>'th character of <code>s</code>.
137    */

138   protected final TrieNode matchChar(TrieNode node, String JavaDoc s, int idx) {
139     return node.getChild(s.charAt(idx));
140   }
141
142   /**
143    * Adds any necessary nodes to the trie so that the given
144    * <code>String</code> can be decoded and the last character is
145    * represented by a terminal node. Zero-length <code>Strings</code>
146    * are ignored.
147    */

148   protected final void addPatternForward(String JavaDoc s) {
149     TrieNode node= root;
150     int stop= s.length() - 1;
151     int i;
152     if (s.length() > 0) {
153       for (i= 0; i < stop; i++)
154         node= node.getChildAddIfNotPresent(s.charAt(i), false);
155       node= node.getChildAddIfNotPresent(s.charAt(i), true);
156     }
157   }
158
159   /**
160    * Adds any necessary nodes to the trie so that the given
161    * <code>String</code> can be decoded <em>in reverse</em> and the
162    * first character is represented by a terminal node. Zero-length
163    * <code>Strings</code> are ignored.
164    */

165   protected final void addPatternBackward(String JavaDoc s) {
166     TrieNode node= root;
167     if (s.length() > 0) {
168       for (int i= s.length()-1; i > 0; i--)
169         node= node.getChildAddIfNotPresent(s.charAt(i), false);
170       node= node.getChildAddIfNotPresent(s.charAt(0), true);
171     }
172   }
173
174   /**
175    * Returns true if the given <code>String</code> is matched by a
176    * pattern in the trie
177    */

178   public abstract boolean matches(String JavaDoc input);
179
180   /**
181    * Returns the shortest substring of <code>input<code> that is
182    * matched by a pattern in the trie, or <code>null<code> if no match
183    * exists.
184    */

185   public abstract String JavaDoc shortestMatch(String JavaDoc input);
186
187   /**
188    * Returns the longest substring of <code>input<code> that is
189    * matched by a pattern in the trie, or <code>null<code> if no match
190    * exists.
191    */

192   public abstract String JavaDoc longestMatch(String JavaDoc input);
193
194 }
195
Popular Tags