KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > websphinx > Wildcard


/*
 * WebSphinx web-crawling toolkit
 *
 * Copyright (c) 1998-2002 Carnegie Mellon University. All rights
 * reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in
 * the documentation and/or other materials provided with the
 * distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
 * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
 * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */


package websphinx;

35 /**
36  * Wildcard pattern. Wildcards are similar to sh-style file globbing.
37  * A wildcard pattern is implicitly anchored, meaning that it must match the entire string.
38  * The wildcard operators are:
39  * <PRE>
40  * ? matches one arbitrary character
41  * * matches zero or more arbitrary characters
42  * [xyz] matches characters x or y or z
43  * {foo,bar,baz} matches expressions foo or bar or baz
44  * () grouping to extract fields
45  * \ escape one of these special characters
46  * </PRE>
47  * Escape codes (like \n and \t) and Perl5 character classes (like \w and \s) may also be used.
48  */

49 public class Wildcard extends Regexp {
50     String JavaDoc stringRep;
51     
52     public Wildcard (String JavaDoc pattern) {
53         super ("^" + toRegexp (pattern) + "$");
54         stringRep = pattern;
55     }
56
57     public boolean equals (Object JavaDoc object) {
58         if (! (object instanceof Wildcard))
59             return false;
60         Wildcard p = (Wildcard)object;
61         return p.stringRep.equals (stringRep);
62     }
63     
64     public static String JavaDoc toRegexp (String JavaDoc wildcard) {
65         String JavaDoc s = wildcard;
66
67         int inAlternative = 0;
68         int inSet = 0;
69         boolean inEscape = false;
70
71         StringBuffer JavaDoc output = new StringBuffer JavaDoc ();
72
73         int len = s.length ();
74         for (int i=0; i<len; ++i) {
75             char c = s.charAt (i);
76             if (inEscape) {
77                 output.append (c);
78                 inEscape = false;
79             }
80             else {
81                 switch (c) {
82                   case '\\':
83                     output.append (c);
84                     inEscape = true;
85                     break;
86                   case '?':
87                     output.append ('.');
88                     break;
89                   case '*':
90                     output.append (".*");
91                     break;
92                   case '[':
93                     output.append (c);
94                     ++inSet;
95                     break;
96                   case ']':
97                       // FIX: handle [] case properly
98
output.append (c);
99                     --inSet;
100                     break;
101                   case '{':
102                     output.append ("(?:");
103                     ++inAlternative;
104                     break;
105                   case ',':
106                     if (inAlternative > 0)
107                         output.append ("|");
108                     else
109                         output.append (c);
110                     break;
111                   case '}':
112                     output.append (")");
113                     --inAlternative;
114                     break;
115                   case '^':
116                     if (inSet > 0) {
117                         output.append (c);
118                     }
119                     else {
120                         output.append ('\\');
121                         output.append (c);
122                     }
123                     break;
124                   case '$':
125                   case '.':
126                   case '|':
127                   case '+':
128                     output.append ('\\');
129                     output.append (c);
130                     break;
131                   default:
132                     output.append (c);
133                     break;
134                 }
135             }
136         }
137         if (inEscape)
138             output.append ('\\');
139
140         return output.toString ();
141     }
142
143     public static String JavaDoc escape (String JavaDoc s) {
144         return rcm.util.Str.escape (s, '\\', "\\?*{}()[]");
145     }
146     
147     public String JavaDoc toString () {
148         return stringRep;
149     }
150     
151     public static void main (String JavaDoc[] args) throws Exception JavaDoc {
152         if (args.length < 2) {
153             System.err.println ("usage: Wildcard <pattern> <string>*");
154             return;
155         }
156
157         Pattern p = new Wildcard (args[0].replace ('_', ' ') );
158         for (int i=1; i<args.length; ++i) {
159             Region r = p.oneMatch (args[i]);
160             System.out.println (args[i] + ": " + (r != null));
161             if (r != null) {
162                 System.out.println (" [" + r.getStart() + "," + r.getEnd() + "]" + r);
163                 Region[] groups = r.getFields ("websphinx.groups");
164                 if (groups != null)
165                     for (int j=0; j<groups.length; ++j) {
166                         Region s = groups[j];
167                         System.out.println (" "+"[" + s.getStart() + "," + s.getEnd() + "]" + s);
168                     }
169             }
170         }
171     }
172 }
173
Popular Tags