BoyerMooreSearchMatcher


1   /*
2    * BoyerMooreSearchMatcher.java - Literal pattern String matcher utilizing the
3    *         Boyer-Moore algorithm
4    * :tabSize=8:indentSize=8:noTabs=false:
5    * :folding=explicit:collapseFolds=1:
6    *
7    * Copyright (C) 1999, 2000 mike dillon
8    * Portions copyright (C) 2001 Tom Locke
9    * Portions copyright (C) 2001, 2002 Slava Pestov
10   *
11   * This program is free software; you can redistribute it and/or
12   * modify it under the terms of the GNU General Public License
13   * as published by the Free Software Foundation; either version 2
14   * of the License, or any later version.
15   *
16   * This program is distributed in the hope that it will be useful,
17   * but WITHOUT ANY WARRANTY; without even the implied warranty of
18   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19   * GNU General Public License for more details.
20   *
21   * You should have received a copy of the GNU General Public License
22   * along with this program; if not, write to the Free Software
23   * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
24   */
25  
26  package org.gjt.sp.jedit.search;
27  
28  /**
29   * Implements literal search using the Boyer-Moore algorithm.
30   * @version $Id: BoyerMooreSearchMatcher.java 8167 2006-12-02 21:41:53Z kpouer $
31   */
32  public class BoyerMooreSearchMatcher extends SearchMatcher
33  {
34      //{{{ BoyerMooreSearchMatcher constructor
35      /**
36       * Creates a new string literal matcher.
37       * @param pattern the search pattern
38       * @param ignoreCase <code>true</code> if you want to ignore case
39       */
40      public BoyerMooreSearchMatcher(String   pattern, boolean ignoreCase)
41      {
42          this.pattern = pattern.toCharArray();
43          if(ignoreCase)
44          {
45              for(int i = 0; i < this.pattern.length; i++)
46              {
47                  this.pattern[i] = Character.toUpperCase(
48                      this.pattern[i]);
49              }
50          }
51  
52          this.ignoreCase = ignoreCase;
53  
54          pattern_end = this.pattern.length - 1;
55      } //}}}
56  
57      //{{{ nextMatch() method
58      /**
59       * Returns the offset of the first match of the specified text
60       * within this matcher.
61       * @param text The text to search in
62       * @param start True if the start of the segment is the beginning of the
63       * buffer
64       * @param end True if the end of the segment is the end of the buffer
65       * @param firstTime If false and the search string matched at the start
66       * offset with length zero, automatically find next match
67       * @param reverse If true, searching will be performed in a backward
68       * direction.
69       * @return an array where the first element is the start offset
70       * of the match, and the second element is the end offset of
71       * the match
72       * @since jEdit 4.2pre4
73       */
74      public SearchMatcher.Match nextMatch(CharSequence   text,
75          boolean start, boolean end, boolean firstTime,
76          boolean reverse)
77      {
78          int pos = match(text,reverse);
79  
80          if (pos == -1)
81          {
82              return null;
83          }
84          else
85          {
86              returnValue.start = pos;
87              returnValue.end = pos + pattern.length;
88              return returnValue;
89          }
90      } //}}}
91  
92      //{{{ match() method
93      /**
94       *  a good introduction to the Boyer-Moore fast string matching
95       *  algorithm may be found on Moore's website at:
96       *
97       *   http://www.cs.utexas.edu/users/moore/best-ideas/string-searching/
98       *
99       * @since jEdit 4.3pre5
100      */
101     public int match(CharSequence   text, boolean reverse)
102     {
103         //{{{
104         // lazily create skip and suffix arrays for either the
105         // search pattern, or the reversed search pattern
106         int[] skip, suffix;
107         if(reverse)
108         {
109             if(back_skip == null)
110             {
111                 back_skip = generateSkipArray(true);
112                 back_suffix = generateSuffixArray(true);
113             }
114             skip = back_skip;
115             suffix = back_suffix;
116         }
117         else
118         {
119             if(fwd_skip == null)
120             {
121                 fwd_skip = generateSkipArray(false);
122                 fwd_suffix = generateSuffixArray(false);
123             }
124             skip = fwd_skip;
125             suffix = fwd_suffix;
126         } //}}}
127 
128         // position variable for pattern test position
129         int pos;
130 
131         // position variable for pattern start
132         int anchor = 0;
133 
134         // last possible start position of a match with this pattern;
135         // this is negative if the pattern is longer than the text
136         // causing the search loop below to immediately fail
137         //int last_anchor = reverseSearch
138         //  ? offset + pattern.length - 1
139         //  : length - pattern.length;
140 
141         char ch = 0;
142 
143         int bad_char;
144         int good_suffix;
145 
146         // the search works by starting the anchor (first character
147         // of the pattern) at the initial offset. as long as the
148         // anchor is far enough from the enough of the text for the
149         // pattern to match, and until the pattern matches, we
150         // compare the pattern to the text from the last character
151         // to the first character in reverse order. where a character
152         // in the pattern mismatches, we use the two heuristics
153         // based on the mismatch character and its position in the
154         // pattern to determine the furthest we can move the anchor
155         // without missing any potential pattern matches.
156         SEARCH:
157         while (anchor + pattern_end < text.length())
158         {
159             for (pos = pattern_end; pos >= 0; --pos)
160             {
161                 ch = text.charAt(pos + anchor);
162                 if(ignoreCase)
163                     ch = Character.toUpperCase(ch);
164 
165                 // pattern test
166                 if ((reverse ? ch != pattern[pattern_end - pos]
167                     : ch != pattern[pos]))
168                 {
169                     // character mismatch, determine how many characters to skip
170 
171                     // heuristic #1
172                     bad_char = pos - skip[getSkipIndex(ch)];
173 
174                     // heuristic #2
175                     good_suffix = suffix[pos];
176 
177                     // skip the greater of the two distances provided by the
178                     // heuristics
179                     int skip_index = (bad_char > good_suffix) ? bad_char : good_suffix;
180                     anchor += skip_index;
181 
182                     // go back to the while loop
183                     continue SEARCH;
184                 }
185             }
186 
187             // MATCH: return the position of its first character
188             return anchor;
189         }
190 
191         // MISMATCH: return -1 as defined by API
192         return -1;
193     } //}}}
194 
195     //{{{ toString() method
196     public String   toString()
197     {
198         return "BoyerMooreSearchMatcher[" + new String  (pattern) + ']';
199     } //}}}
200 
201     //{{{ Private members
202     private char[] pattern;
203     private int pattern_end;
204     private boolean ignoreCase;
205 
206     // Boyer-Moore member fields
207     private int[] fwd_skip;
208     private int[] fwd_suffix;
209     private int[] back_skip;
210     private int[] back_suffix;
211     //}}}
212 
213     // Boyer-Moore helper methods
214 
215     //{{{ generateSkipArray() method
216     /*
217      *  the 'skip' array is used to determine for each index in the
218      *  hashed alphabet how many characters can be skipped if
219      *  a mismatch occurs on a characater hashing to that index.
220      */
221     private int[] generateSkipArray(boolean reverse)
222     {
223         // initialize the skip array to all zeros
224         int[] skip = new int[256];
225 
226         // leave the table cleanly-initialized for an empty pattern
227         if (pattern.length == 0)
228             return skip;
229 
230         int pos = 0;
231 
232         do
233         {
234             skip[getSkipIndex(pattern[reverse ? pattern_end - pos : pos])] = pos;
235         }
236         while (++pos < pattern.length);
237 
238         return skip;
239     } //}}}
240 
241     //{{{ getSkipIndex() method
242     /*
243      *  to avoid our skip table having a length of 2 ^ 16, we hash each
244      *  character of the input into a character in the alphabet [\x00-\xFF]
245      *  using the lower 8 bits of the character's value (resulting in
246      *  a more reasonable skip table of length 2 ^ 8).
247      *
248      *  the result of this is that more than one character can hash to the
249      *  same index, but since the skip table encodes the position of
250      *  occurence of the character furthest into the string with a particular
251      *  index (whether or not it is the only character with that index), an
252      *  index collision only means that that this heuristic will give a
253      *  sub-optimal skip (i.e. a complete skip table could use the differences
254      *  between colliding characters to maximal effect, at the expense of
255      *  building a table that is over 2 orders of magnitude larger and very
256      *  sparse).
257      */
258     private static final int getSkipIndex(char ch)
259     {
260         return ch & 0x000000FF;
261     } //}}}
262 
263     //{{{ generateSuffixArray() method
264     /*
265      *  XXX: hairy code that is basically just a functional(?) port of some
266      *  other code i barely understood
267      */
268     private int[] generateSuffixArray(boolean reverse)
269     {
270         int m = pattern.length;
271 
272         int j = m + 1;
273 
274         int[] suffix = new int[j];
275         int[] tmp = new int[j];
276         tmp[m] = j;
277 
278         for (int i = m; i > 0; --i)
279         {
280             while (j <= m && pattern[reverse ? pattern_end - i + 1 : i - 1]
281                 != pattern[reverse ? pattern_end - j + 1 : j - 1])
282             {
283                 if (suffix[j] == 0)
284                 {
285                     suffix[j] = j - i;
286                 }
287 
288                 j = tmp[j];
289             }
290 
291             tmp[i - 1] = --j;
292         }
293 
294         int k = tmp[0];
295 
296         for (j = 0; j <= m; j++)
297         {
298             // the code above builds a 1-indexed suffix array,
299             // but we shift it to be 0-indexed, ignoring the
300             // original 0-th element
301             if (j > 0)
302             {
303                 suffix[j - 1] = (suffix[j] == 0) ? k : suffix[j];
304             }
305 
306             if (j == k)
307             {
308                 k = tmp[k];
309             }
310         }
311 
312         return suffix;
313     } //}}}
314 
315     //}}}
316 }
317
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags