KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > htmlparser > filters > RegexFilter


1 // HTMLParser Library $Name: v1_5_20050313 $ - A java-based parser for HTML
2
// http://sourceforge.org/projects/htmlparser
3
// Copyright (C) 2003 Derrick Oswald
4
//
5
// Revision Control Information
6
//
7
// $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/filters/RegexFilter.java,v $
8
// $Author: derrickoswald $
9
// $Date: 2005/02/13 20:36:00 $
10
// $Revision: 1.2 $
11
//
12
// This library is free software; you can redistribute it and/or
13
// modify it under the terms of the GNU Lesser General Public
14
// License as published by the Free Software Foundation; either
15
// version 2.1 of the License, or (at your option) any later version.
16
//
17
// This library is distributed in the hope that it will be useful,
18
// but WITHOUT ANY WARRANTY; without even the implied warranty of
19
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20
// Lesser General Public License for more details.
21
//
22
// You should have received a copy of the GNU Lesser General Public
23
// License along with this library; if not, write to the Free Software
24
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
25
//
26

27 package org.htmlparser.filters;
28
29 import java.util.regex.Matcher JavaDoc;
30 import java.util.regex.Pattern JavaDoc;
31
32 import org.htmlparser.Node;
33 import org.htmlparser.NodeFilter;
34 import org.htmlparser.Text;
35
36 /**
37  * This filter accepts all string nodes matching a regular expression.
38  * Because this searches {@link org.htmlparser.Text Text} nodes. it is
39  * only useful for finding small fragments of text, where it is
40  * unlikely to be broken up by a tag. To find large fragments of text
41  * you should convert the page to plain text with something like the
42  * {@link org.htmlparser.beans.StringBean StringBean} and then apply
43  * the regular expression.
44  * <p>
45  * For example, to look for dates use:
46  * <pre>
47  * (19|20)\d\d([- \\/.](0[1-9]|1[012])[- \\/.](0[1-9]|[12][0-9]|3[01]))?
48  * </pre>
49  * as in:
50  * <pre>
51  * Parser parser = new Parser ("http://cbc.ca");
52  * RegexFilter filter = new RegexFilter ("(19|20)\\d\\d([- \\\\/.](0[1-9]|1[012])[- \\\\/.](0[1-9]|[12][0-9]|3[01]))?");
53  * NodeIterator iterator = parser.extractAllNodesThatMatch (filter).elements ();
54  * </pre>
55  * which matches a date in yyyy-mm-dd format between 1900-01-01 and 2099-12-31,
56  * with a choice of five separators, dash, space, either slash or a period.
57  * The year is matched by (19|20)\d\d which uses alternation to allow the
58  * either 19 or 20 as the first two digits. The round brackets are mandatory.
59  * The month is matched by 0[1-9]|1[012], again enclosed by round brackets
60  * to keep the two options together. By using character classes, the first
61  * option matches a number between 01 and 09, and the second matches 10, 11 or 12.
62  * The last part of the regex consists of three options. The first matches
63  * the numbers 01 through 09, the second 10 through 29, and the third matches 30 or 31.
64  * The day and month are optional, but must occur together because of the ()?
65  * bracketing after the year.
66  */

67 public class RegexFilter implements NodeFilter
68 {
69     /**
70      * Use match() matching strategy.
71      */

72     public static final int MATCH = 1;
73
74     /**
75      * Use lookingAt() match strategy.
76      */

77     public static final int LOOKINGAT = 2;
78
79     /**
80      * Use find() match strategy.
81      */

82     public static final int FIND = 3;
83
84     /**
85      * The regular expression to search for.
86      */

87     protected String JavaDoc mPatternString;
88
89     /**
90      * The compiled regular expression to search for.
91      */

92     protected Pattern JavaDoc mPattern;
93
94     /**
95      * The match strategy.
96      * @see #RegexFilter(String, int)
97      */

98     protected int mStrategy;
99
100     /**
101      * Creates a new instance of RegexFilter that accepts string nodes matching
102      * the regular expression ".*" using the FIND strategy.
103      */

104     public RegexFilter ()
105     {
106         this (".*", FIND);
107     }
108
109     /**
110      * Creates a new instance of RegexFilter that accepts string nodes matching
111      * a regular expression using the FIND strategy.
112      * @param pattern The pattern to search for.
113      */

114     public RegexFilter (String JavaDoc pattern)
115     {
116         this (pattern, FIND);
117     }
118
119     /**
120      * Creates a new instance of RegexFilter that accepts string nodes matching
121      * a regular expression.
122      * @param pattern The pattern to search for.
123      * @param strategy The type of match:
124      * <ol>
125      * <li>{@link #MATCH} use matches() method: attempts to match the entire input sequence against the pattern</li>
126      * <li>{@link #LOOKINGAT} use lookingAt() method: attempts to match the input sequence, starting at the beginning, against the pattern</li>
127      * <li>{@link #FIND} use find() method: scans the input sequence looking for the next subsequence that matches the pattern</li>
128      * </ol>
129      */

130     public RegexFilter (String JavaDoc pattern, int strategy)
131     {
132         setPattern (pattern);
133         setStrategy (strategy);
134     }
135
136     /**
137      * Get the search pattern.
138      * @return Returns the pattern.
139      */

140     public String JavaDoc getPattern ()
141     {
142         return (mPatternString);
143     }
144
145     /**
146      * Set the search pattern.
147      * @param pattern The pattern to set.
148      */

149     public void setPattern (String JavaDoc pattern)
150     {
151         mPatternString = pattern;
152         mPattern = Pattern.compile (pattern);
153     }
154
155     /**
156      * Get the search strategy.
157      * @return Returns the strategy.
158      */

159     public int getStrategy ()
160     {
161         return (mStrategy);
162     }
163
164     /**
165      * Set the search pattern.
166      * @param strategy The strategy to use. One of MATCH, LOOKINGAT or FIND.
167      */

168     public void setStrategy (int strategy)
169     {
170         mStrategy = strategy;
171     }
172
173     /**
174      * Accept string nodes that match the regular expression.
175      * @param node The node to check.
176      */

177     public boolean accept (Node node)
178     {
179         String JavaDoc string;
180         Matcher JavaDoc matcher;
181         boolean ret;
182         
183         ret = false;
184         if (node instanceof Text)
185         {
186             string = ((Text)node).getText ();
187             matcher = mPattern.matcher (string);
188             switch (mStrategy)
189             {
190                 case MATCH:
191                     ret = matcher.matches ();
192                     break;
193                 case LOOKINGAT:
194                     ret = matcher.lookingAt ();
195                     break;
196                 case FIND:
197                     ret = matcher.find ();
198                     break;
199             }
200         }
201
202         return (ret);
203     }
204 }
205
Popular Tags