KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > com > mckoi > database > PatternSearch


1 /**
2  * com.mckoi.database.PatternSearch 05 Sep 1998
3  *
4  * Mckoi SQL Database ( http://www.mckoi.com/database )
5  * Copyright (C) 2000, 2001, 2002 Diehl and Associates, Inc.
6  *
7  * This program is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU General Public License
9  * Version 2 as published by the Free Software Foundation.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License Version 2 for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * Version 2 along with this program; if not, write to the Free Software
18  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
19  *
20  * Change Log:
21  *
22  *
23  */

24
25 package com.mckoi.database;
26
27 import com.mckoi.database.global.Types;
28 import com.mckoi.util.IntegerVector;
29 import com.mckoi.util.BlockIntegerList;
30 import com.mckoi.util.IntegerIterator;
31
32
33 /**
34  * This is a static class that performs the operations to do a pattern search
35  * on a given column of a table. The pattern syntax is very simple and follows
36  * that of the SQL standard.
37  * <p>
38  * It works as follows:
39  * The '%' character represents any sequence of characters.
40  * The '_' character represents some character.
41  * <p>
42  * Therefore, the pattern search 'Toby%' will find all rows that start with
43  * the string 'Toby' and end with any sequence of characters. The pattern
44  * 'T% Downer%' will find all names starting with T and containing 'Downer'
45  * somewhere in the end. The pattern '_at' will find all three letter words
46  * ending with 'at'.
47  * <p>
48  * NOTE: A 'ab%' type search is faster than a '%bc' type search. If the start
49  * of the search pattern is unknown then the entire contents of the column
50  * need to be accessed.
51  *
52  * @author Tobias Downer
53  */

54
55 public final class PatternSearch {
56
57   /**
58    * Statics for the tokens.
59    */

60   private final static char ZERO_OR_MORE_CHARS = '%';
61   private final static char ONE_CHAR = '_';
62
63
64   public static boolean testSearch(String JavaDoc pattern, String JavaDoc expression,
65                                    boolean result) {
66     System.out.print("Pattern: ");
67     System.out.println("'" + pattern + "'");
68     System.out.print("Expression: ");
69     System.out.println("'" + expression + "'");
70
71     boolean tested_as = fullPatternMatch(pattern, expression, '\\');
72     System.out.print("Result: ");
73     System.out.print(tested_as);
74     if (tested_as != result) {
75       System.out.println(" *** FAILED, Expected: " + result + " ***");
76     }
77     else {
78       System.out.println();
79     }
80     return tested_as;
81   }
82
83   public static void main(String JavaDoc[] args) {
84     // Testing the SQL expression parser.
85
testSearch("", "abc", false);
86     testSearch("%", "abc", true);
87     testSearch("a%", "abc", true);
88     testSearch("ab%", "abc", true);
89     testSearch("abc%", "abc", true);
90     testSearch("abcd%", "abc", false);
91     testSearch("abcd", "abc", false);
92     testSearch("abc", "abc", true);
93     testSearch("ab", "abc", false);
94     testSearch("a", "abc", false);
95     testSearch("a_", "abc", false);
96     testSearch("ab_", "abc", true);
97     testSearch("abc_", "abc", false);
98     testSearch("a_c", "abc", true);
99     testSearch("a%bc", "abc", true);
100     testSearch("a%c", "abc", true);
101     testSearch("%c", "abc", true);
102     testSearch("a_bc", "abc", false);
103     testSearch("__c", "abc", true);
104     testSearch("a__", "abc", true);
105
106     testSearch("a\\_\\_", "a__", true);
107     testSearch("a\\__", "a__", true);
108     testSearch("a\\__", "a_b", true);
109     testSearch("\\___", "_ab", true);
110     testSearch("\\_\\__", "_ab", false);
111     testSearch("\\_\\__", "__b", true);
112
113     testSearch("\\%ab", "%ab", true);
114     testSearch("ab\\%", "ab%", true);
115     testSearch("cab\\%", "cab", false);
116     testSearch("cab%", "cab", true);
117
118   }
119
120
121   /**
122    * Returns true if the given character is a wild card (unknown).
123    */

124   private static boolean isWildCard(char ch) {
125     return (ch == ONE_CHAR || ch == ZERO_OR_MORE_CHARS);
126   }
127
128   /**
129    * Matches a pattern against a string and returns true if it matches or
130    * false otherwise. This matches patterns that do not necessarily start
131    * with a wild card unlike the 'patternMatch' method.
132    */

133   public static boolean fullPatternMatch(String JavaDoc pattern, final String JavaDoc str,
134                                          char escape_char) {
135     StringBuffer JavaDoc start = new StringBuffer JavaDoc();
136     String JavaDoc rezt = null;
137     int len = pattern.length();
138     int i = 0;
139     boolean last_escape_char = false;
140     for (; i < len && rezt == null; ++i) {
141       char c = pattern.charAt(i);
142       if (last_escape_char) {
143         last_escape_char = false;
144         start.append(c);
145       }
146       else if (c == escape_char) {
147         last_escape_char = true;
148       }
149       else if (isWildCard(c)) {
150         rezt = pattern.substring(i);
151       }
152       else {
153         start.append(c);
154       }
155     }
156
157     if (rezt == null) {
158       rezt = "";
159     }
160
161     String JavaDoc st = new String JavaDoc(start);
162
163 // System.out.println("--");
164
// System.out.println(str);
165
// System.out.println(st);
166

167     if (str.startsWith(st)) {
168       String JavaDoc str_rezt = str.substring(st.length()); // (i)
169

170       if (rezt.length() > 0) {
171         return patternMatch(rezt, str_rezt, escape_char);
172       }
173       else {
174         return str_rezt.length() == 0;
175       }
176     }
177     else {
178       return false;
179     }
180
181   }
182
183   /**
184    * This is the pattern match recurrsive method. It recurses on each wildcard
185    * expression in the pattern which makes for slightly better efficiency than
186    * a character recurse algorithm. However, patterns such as "_%_a" will
187    * result in many recursive calls.
188    * <p>
189    * Returns true if the pattern matches.
190    * <p>
191    * NOTE: That "_%_" will be less efficient than "__%" and will produce the
192    * same result.
193    * NOTE: It requires that a wild card character is the first character in
194    * the expression.
195    * ISSUE: Pattern optimiser, we should optimise wild cards of type "%__" to
196    * "__%", or "%__%_%_%" to "____%". Optimised forms are identical in
197    * result and more efficient. This optimisation could be performed by the
198    * client during parsing of the LIKE statement.
199    * HACKING ISSUE: Badly formed wild cards may result in hogging of server
200    * side resources.
201    */

202   public static boolean patternMatch(String JavaDoc pattern, String JavaDoc expression,
203                                      char escape_char) {
204
205     // Look at first character in pattern, if it's a ONE_CHAR wildcard then
206
// check expression and pattern match until next wild card.
207

208     if (pattern.charAt(0) == ONE_CHAR) {
209
210       // Else step through each character in pattern and see if it matches up
211
// with the expression until a wild card is found or the end is reached.
212
// When the end of the pattern is reached, 'finished' is set to true.
213

214       int i = 1;
215       boolean finished = (i >= pattern.length() || expression.length() < 1);
216       boolean last_was_escape = false;
217       int checked = 0;
218       while (!finished) {
219         char c = pattern.charAt(i);
220         if (!last_was_escape && c == escape_char) {
221           last_was_escape = true;
222           if (i >= expression.length()) {
223             return false;
224           }
225           ++i;
226         }
227         else if (last_was_escape || !isWildCard(c)) {
228           last_was_escape = false;
229           // If expression and pattern character doesn't match or end of
230
// expression reached, search has failed.
231
if (i >= expression.length() || c != expression.charAt(i)) {
232             return false;
233           }
234           ++i;
235           ++checked;
236         }
237         else {
238           // found a wildcard, so recurse on this wildcard
239
return patternMatch(pattern.substring(i), expression.substring(i),
240                               escape_char);
241         }
242
243         finished = (i >= pattern.length());
244       }
245
246       // The pattern length minus any escaped characters
247
int real_pattern_length = 0;
248       int sz = pattern.length();
249       for (int n = 0; n < sz; ++n) {
250         if (pattern.charAt(n) != escape_char) {
251           ++real_pattern_length;
252         }
253         else {
254           ++n;
255         }
256       }
257
258       // If pattern and expression lengths match then we have walked through
259
// the expression and found a match, otherwise no match.
260

261       return real_pattern_length == expression.length();
262
263     }
264
265     // Therefore we are doing a ZERO_OR_MORE_CHARS wildcard check.
266

267     // If the pattern is '%' (ie. pattern.length() == 1 because it's only 1
268
// character in length (the '%' character)) then it doesn't matter what the
269
// expression is, we have found a match.
270

271     if (pattern.length() == 1) {
272       return true;
273     }
274
275     // Look at following character in pattern, and extract all the characters
276
// before the next wild card.
277

278     StringBuffer JavaDoc next_string = new StringBuffer JavaDoc();
279     int i = 1;
280     boolean finished = (i >= pattern.length());
281     boolean last_was_escape = false;
282     while (!finished) {
283       char next_char = pattern.charAt(i);
284       if (!last_was_escape && next_char == escape_char) {
285         last_was_escape = true;
286         ++i;
287         if (i >= pattern.length()) {
288           finished = true;
289         }
290       }
291       else if (last_was_escape || !isWildCard(next_char)) {
292         last_was_escape = false;
293         next_string.append(next_char);
294         ++i;
295         if (i >= pattern.length()) {
296           finished = true;
297         }
298       }
299       else {
300         finished = true;
301       }
302     }
303
304     String JavaDoc find_string = new String JavaDoc(next_string);
305
306     // Special case optimisation if we have found the end of the pattern, all
307
// we need to do is check if 'find_string' is on the end of the expression.
308
// eg. pattern = "%er", will have a 'find_string' of "er" and it is saying
309
// 'does the expression end with 'er''.
310

311     if (i >= pattern.length()) {
312       return (expression.endsWith(find_string));
313     }
314
315     // Otherwise we must have finished with another wild card.
316
// Try and find 'next_string' in the expression. If its found then
317
// recurse over the next pattern.
318

319     int find_str_length = find_string.length();
320     int str_index = expression.indexOf(find_string, 0);
321
322     while (str_index != -1) {
323
324       boolean matched = patternMatch(
325                       pattern.substring(1 + find_str_length),
326                       expression.substring(str_index + find_str_length),
327                       escape_char);
328
329       if (matched) {
330         return true;
331       }
332
333       str_index = expression.indexOf(find_string, str_index + 1);
334     }
335
336     return false;
337
338   }
339
340   /**
341    * This is the search method. It requires a table to search, a column of the
342    * table, and a pattern. It returns the rows in the table that match the
343    * pattern if any. Pattern searching only works successfully on columns that
344    * are of type Types.DB_STRING.
345    * This works by first reducing the search to all cells that contain the
346    * first section of text. ie. pattern = "Toby% ___ner" will first reduce
347    * search to all rows between "Toby" and "Tobz". This makes for better
348    * efficiency.
349    */

350   static IntegerVector search(Table table, int column, String JavaDoc pattern) {
351     return search(table, column, pattern, '\\');
352   }
353
354   /**
355    * This is the search method. It requires a table to search, a column of the
356    * table, and a pattern. It returns the rows in the table that match the
357    * pattern if any. Pattern searching only works successfully on columns that
358    * are of type Types.DB_STRING.
359    * This works by first reducing the search to all cells that contain the
360    * first section of text. ie. pattern = "Toby% ___ner" will first reduce
361    * search to all rows between "Toby" and "Tobz". This makes for better
362    * efficiency.
363    */

364   static IntegerVector search(Table table, int column, String JavaDoc pattern,
365                               char escape_char) {
366
367     // Get the type for the column
368
TType col_type = table.getDataTableDef().columnAt(column).getTType();
369
370     // If the column type is not a string type then report an error.
371
if (!(col_type instanceof TStringType)) {
372       throw new Error JavaDoc("Unable to perform a pattern search " +
373                       "on a non-String type column.");
374     }
375     TStringType col_string_type = (TStringType) col_type;
376
377     // ---------- Pre Search ----------
378

379     // First perform a 'pre-search' on the head of the pattern. Note that
380
// there may be no head in which case the entire column is searched which
381
// has more potential to be expensive than if there is a head.
382

383     StringBuffer JavaDoc pre_pattern = new StringBuffer JavaDoc();
384     int i = 0;
385     boolean finished = i >= pattern.length();
386     boolean last_is_escape = false;
387
388     while (!finished) {
389       char c = pattern.charAt(i);
390       if (last_is_escape) {
391         last_is_escape = true;
392         pre_pattern.append(c);
393       }
394       else if (c == escape_char) {
395         last_is_escape = true;
396       }
397       else if (!isWildCard(c)) {
398         pre_pattern.append(c);
399
400         ++i;
401         if (i >= pattern.length()) {
402           finished = true;
403         }
404
405       }
406       else {
407         finished = true;
408       }
409     }
410
411     // This is set with the remaining search.
412
String JavaDoc post_pattern;
413
414     // This is our initial search row set. In the second stage, rows are
415
// eliminated from this vector.
416
IntegerVector search_case;
417
418     if (i >= pattern.length()) {
419       // If the pattern has no 'wildcards' then just perform an EQUALS
420
// operation on the column and return the results.
421

422       TObject cell = new TObject(col_type, pattern);
423       return table.selectRows(column, Operator.get("="), cell);
424
425       // RETURN
426
}
427     else if (pre_pattern.length() == 0 ||
428              col_string_type.getLocale() != null) {
429
430       // No pre-pattern easy search :-(. This is either because there is no
431
// pre pattern (it starts with a wild-card) or the locale of the string
432
// is non-lexicographical. In either case, we need to select all from
433
// the column and brute force the search space.
434

435       search_case = table.selectAll(column);
436       post_pattern = pattern;
437
438     }
439     else {
440
441       // Criteria met: There is a pre_pattern, and the column locale is
442
// lexicographical.
443

444       // Great, we can do an upper and lower bound search on our pre-search
445
// set. eg. search between 'Geoff' and 'Geofg' or 'Geoff ' and
446
// 'Geoff\33'
447

448       String JavaDoc lower_bounds = new String JavaDoc(pre_pattern);
449       int next_char = pre_pattern.charAt(i - 1) + 1;
450       pre_pattern.setCharAt(i - 1, (char) next_char);
451       String JavaDoc upper_bounds = new String JavaDoc(pre_pattern);
452
453       post_pattern = pattern.substring(i);
454
455       TObject cell_lower = new TObject(col_type, lower_bounds);
456       TObject cell_upper = new TObject(col_type, upper_bounds);
457       
458       // Select rows between these two points.
459

460       search_case = table.selectRows(column, cell_lower, cell_upper);
461
462     }
463
464     // ---------- Post search ----------
465

466 // [This optimization assumes that (NULL like '%' = true) which is incorrect]
467
// // EFFICIENCY: This is a special case efficiency case.
468
// // If 'post_pattern' is '%' then we have already found all the records in
469
// // our pattern.
470
//
471
// if (post_pattern.equals("%")) {
472
// return search_case;
473
// }
474

475     int pre_index = i;
476
477     // Now eliminate from our 'search_case' any cells that don't match our
478
// search pattern.
479
// Note that by this point 'post_pattern' will start with a wild card.
480
// This follows the specification for the 'patternMatch' method.
481
// EFFICIENCY: This is a brute force iterative search. Perhaps there is
482
// a faster way of handling this?
483

484     BlockIntegerList i_list = new BlockIntegerList(search_case);
485     IntegerIterator iterator = i_list.iterator(0, i_list.size() - 1);
486
487     while (iterator.hasNext()) {
488
489       // Get the expression (the contents of the cell at the given column, row)
490

491       boolean pattern_matches = false;
492       TObject cell = table.getCellContents(column, iterator.next());
493       // Null values doesn't match with anything
494
if (!cell.isNull()) {
495         String JavaDoc expression = cell.getObject().toString();
496         // We must remove the head of the string, which has already been
497
// found from the pre-search section.
498
expression = expression.substring(pre_index);
499         pattern_matches = patternMatch(post_pattern, expression, escape_char);
500       }
501       if (!pattern_matches) {
502         // If pattern does not match then remove this row from the search.
503
iterator.remove();
504       }
505
506     }
507
508     return new IntegerVector(i_list);
509
510   }
511
512   // ---------- Matching against a regular expression ----------
513

514   /**
515    * Matches a string against a regular expression pattern. We use the regex
516    * library as specified in the DatabaseSystem configuration.
517    */

518   static boolean regexMatch(TransactionSystem system,
519                             String JavaDoc pattern, String JavaDoc value) {
520     // If the first character is a '/' then we assume it's a Perl style regular
521
// expression (eg. "/.*[0-9]+\/$/i")
522
if (pattern.startsWith("/")) {
523       int end = pattern.lastIndexOf('/');
524       String JavaDoc pat = pattern.substring(1, end);
525       String JavaDoc ops = pattern.substring(end + 1);
526       return system.getRegexLibrary().regexMatch(pat, ops, value);
527     }
528     else {
529       // Otherwise it's a regular expression with no operators
530
return system.getRegexLibrary().regexMatch(pattern, "", value);
531     }
532   }
533
534   /**
535    * Matches a column of a table against a constant regular expression
536    * pattern. We use the regex library as specified in the DatabaseSystem
537    * configuration.
538    */

539   static IntegerVector regexSearch(Table table, int column, String JavaDoc pattern) {
540     // If the first character is a '/' then we assume it's a Perl style regular
541
// expression (eg. "/.*[0-9]+\/$/i")
542
if (pattern.startsWith("/")) {
543       int end = pattern.lastIndexOf('/');
544       String JavaDoc pat = pattern.substring(1, end);
545       String JavaDoc ops = pattern.substring(end + 1);
546       return table.getDatabase().getSystem().getRegexLibrary().regexSearch(
547                                                 table, column, pat, ops);
548     }
549     else {
550       // Otherwise it's a regular expression with no operators
551
return table.getDatabase().getSystem().getRegexLibrary().regexSearch(
552                                                 table, column, pattern, "");
553     }
554   }
555
556 }
557
Popular Tags