KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > gjt > sp > jedit > search > BoyerMooreSearchMatcher


1 /*
2  * BoyerMooreSearchMatcher.java - Literal pattern String matcher utilizing the
3  * Boyer-Moore algorithm
4  * :tabSize=8:indentSize=8:noTabs=false:
5  * :folding=explicit:collapseFolds=1:
6  *
7  * Copyright (C) 1999, 2000 mike dillon
8  * Portions copyright (C) 2001 Tom Locke
9  * Portions copyright (C) 2001, 2002 Slava Pestov
10  *
11  * This program is free software; you can redistribute it and/or
12  * modify it under the terms of the GNU General Public License
13  * as published by the Free Software Foundation; either version 2
14  * of the License, or any later version.
15  *
16  * This program is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19  * GNU General Public License for more details.
20  *
21  * You should have received a copy of the GNU General Public License
22  * along with this program; if not, write to the Free Software
23  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
24  */

25
26 package org.gjt.sp.jedit.search;
27
28 /**
29  * Implements literal search using the Boyer-Moore algorithm.
30  * @version $Id: BoyerMooreSearchMatcher.java 8167 2006-12-02 21:41:53Z kpouer $
31  */

32 public class BoyerMooreSearchMatcher extends SearchMatcher
33 {
34     //{{{ BoyerMooreSearchMatcher constructor
35
/**
36      * Creates a new string literal matcher.
37      * @param pattern the search pattern
38      * @param ignoreCase <code>true</code> if you want to ignore case
39      */

40     public BoyerMooreSearchMatcher(String JavaDoc pattern, boolean ignoreCase)
41     {
42         this.pattern = pattern.toCharArray();
43         if(ignoreCase)
44         {
45             for(int i = 0; i < this.pattern.length; i++)
46             {
47                 this.pattern[i] = Character.toUpperCase(
48                     this.pattern[i]);
49             }
50         }
51
52         this.ignoreCase = ignoreCase;
53
54         pattern_end = this.pattern.length - 1;
55     } //}}}
56

57     //{{{ nextMatch() method
58
/**
59      * Returns the offset of the first match of the specified text
60      * within this matcher.
61      * @param text The text to search in
62      * @param start True if the start of the segment is the beginning of the
63      * buffer
64      * @param end True if the end of the segment is the end of the buffer
65      * @param firstTime If false and the search string matched at the start
66      * offset with length zero, automatically find next match
67      * @param reverse If true, searching will be performed in a backward
68      * direction.
69      * @return an array where the first element is the start offset
70      * of the match, and the second element is the end offset of
71      * the match
72      * @since jEdit 4.2pre4
73      */

74     public SearchMatcher.Match nextMatch(CharSequence JavaDoc text,
75         boolean start, boolean end, boolean firstTime,
76         boolean reverse)
77     {
78         int pos = match(text,reverse);
79
80         if (pos == -1)
81         {
82             return null;
83         }
84         else
85         {
86             returnValue.start = pos;
87             returnValue.end = pos + pattern.length;
88             return returnValue;
89         }
90     } //}}}
91

92     //{{{ match() method
93
/**
94      * a good introduction to the Boyer-Moore fast string matching
95      * algorithm may be found on Moore's website at:
96      *
97      * http://www.cs.utexas.edu/users/moore/best-ideas/string-searching/
98      *
99      * @since jEdit 4.3pre5
100      */

101     public int match(CharSequence JavaDoc text, boolean reverse)
102     {
103         //{{{
104
// lazily create skip and suffix arrays for either the
105
// search pattern, or the reversed search pattern
106
int[] skip, suffix;
107         if(reverse)
108         {
109             if(back_skip == null)
110             {
111                 back_skip = generateSkipArray(true);
112                 back_suffix = generateSuffixArray(true);
113             }
114             skip = back_skip;
115             suffix = back_suffix;
116         }
117         else
118         {
119             if(fwd_skip == null)
120             {
121                 fwd_skip = generateSkipArray(false);
122                 fwd_suffix = generateSuffixArray(false);
123             }
124             skip = fwd_skip;
125             suffix = fwd_suffix;
126         } //}}}
127

128         // position variable for pattern test position
129
int pos;
130
131         // position variable for pattern start
132
int anchor = 0;
133
134         // last possible start position of a match with this pattern;
135
// this is negative if the pattern is longer than the text
136
// causing the search loop below to immediately fail
137
//int last_anchor = reverseSearch
138
// ? offset + pattern.length - 1
139
// : length - pattern.length;
140

141         char ch = 0;
142
143         int bad_char;
144         int good_suffix;
145
146         // the search works by starting the anchor (first character
147
// of the pattern) at the initial offset. as long as the
148
// anchor is far enough from the enough of the text for the
149
// pattern to match, and until the pattern matches, we
150
// compare the pattern to the text from the last character
151
// to the first character in reverse order. where a character
152
// in the pattern mismatches, we use the two heuristics
153
// based on the mismatch character and its position in the
154
// pattern to determine the furthest we can move the anchor
155
// without missing any potential pattern matches.
156
SEARCH:
157         while (anchor + pattern_end < text.length())
158         {
159             for (pos = pattern_end; pos >= 0; --pos)
160             {
161                 ch = text.charAt(pos + anchor);
162                 if(ignoreCase)
163                     ch = Character.toUpperCase(ch);
164
165                 // pattern test
166
if ((reverse ? ch != pattern[pattern_end - pos]
167                     : ch != pattern[pos]))
168                 {
169                     // character mismatch, determine how many characters to skip
170

171                     // heuristic #1
172
bad_char = pos - skip[getSkipIndex(ch)];
173
174                     // heuristic #2
175
good_suffix = suffix[pos];
176
177                     // skip the greater of the two distances provided by the
178
// heuristics
179
int skip_index = (bad_char > good_suffix) ? bad_char : good_suffix;
180                     anchor += skip_index;
181
182                     // go back to the while loop
183
continue SEARCH;
184                 }
185             }
186
187             // MATCH: return the position of its first character
188
return anchor;
189         }
190
191         // MISMATCH: return -1 as defined by API
192
return -1;
193     } //}}}
194

195     //{{{ toString() method
196
public String JavaDoc toString()
197     {
198         return "BoyerMooreSearchMatcher[" + new String JavaDoc(pattern) + ']';
199     } //}}}
200

201     //{{{ Private members
202
private char[] pattern;
203     private int pattern_end;
204     private boolean ignoreCase;
205
206     // Boyer-Moore member fields
207
private int[] fwd_skip;
208     private int[] fwd_suffix;
209     private int[] back_skip;
210     private int[] back_suffix;
211     //}}}
212

213     // Boyer-Moore helper methods
214

215     //{{{ generateSkipArray() method
216
/*
217      * the 'skip' array is used to determine for each index in the
218      * hashed alphabet how many characters can be skipped if
219      * a mismatch occurs on a characater hashing to that index.
220      */

221     private int[] generateSkipArray(boolean reverse)
222     {
223         // initialize the skip array to all zeros
224
int[] skip = new int[256];
225
226         // leave the table cleanly-initialized for an empty pattern
227
if (pattern.length == 0)
228             return skip;
229
230         int pos = 0;
231
232         do
233         {
234             skip[getSkipIndex(pattern[reverse ? pattern_end - pos : pos])] = pos;
235         }
236         while (++pos < pattern.length);
237
238         return skip;
239     } //}}}
240

241     //{{{ getSkipIndex() method
242
/*
243      * to avoid our skip table having a length of 2 ^ 16, we hash each
244      * character of the input into a character in the alphabet [\x00-\xFF]
245      * using the lower 8 bits of the character's value (resulting in
246      * a more reasonable skip table of length 2 ^ 8).
247      *
248      * the result of this is that more than one character can hash to the
249      * same index, but since the skip table encodes the position of
250      * occurence of the character furthest into the string with a particular
251      * index (whether or not it is the only character with that index), an
252      * index collision only means that that this heuristic will give a
253      * sub-optimal skip (i.e. a complete skip table could use the differences
254      * between colliding characters to maximal effect, at the expense of
255      * building a table that is over 2 orders of magnitude larger and very
256      * sparse).
257      */

258     private static final int getSkipIndex(char ch)
259     {
260         return ch & 0x000000FF;
261     } //}}}
262

263     //{{{ generateSuffixArray() method
264
/*
265      * XXX: hairy code that is basically just a functional(?) port of some
266      * other code i barely understood
267      */

268     private int[] generateSuffixArray(boolean reverse)
269     {
270         int m = pattern.length;
271
272         int j = m + 1;
273
274         int[] suffix = new int[j];
275         int[] tmp = new int[j];
276         tmp[m] = j;
277
278         for (int i = m; i > 0; --i)
279         {
280             while (j <= m && pattern[reverse ? pattern_end - i + 1 : i - 1]
281                 != pattern[reverse ? pattern_end - j + 1 : j - 1])
282             {
283                 if (suffix[j] == 0)
284                 {
285                     suffix[j] = j - i;
286                 }
287
288                 j = tmp[j];
289             }
290
291             tmp[i - 1] = --j;
292         }
293
294         int k = tmp[0];
295
296         for (j = 0; j <= m; j++)
297         {
298             // the code above builds a 1-indexed suffix array,
299
// but we shift it to be 0-indexed, ignoring the
300
// original 0-th element
301
if (j > 0)
302             {
303                 suffix[j - 1] = (suffix[j] == 0) ? k : suffix[j];
304             }
305
306             if (j == k)
307             {
308                 k = tmp[k];
309             }
310         }
311
312         return suffix;
313     } //}}}
314

315     //}}}
316
}
317
Popular Tags