KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > htmlparser > filters > CssSelectorNodeFilter


// HTMLParser Library $Name: v1_5_20050313 $ - A java-based parser for HTML
// http://sourceforge.org/projects/htmlparser
// Copyright (C) 2004 Rogers George
//
// Revision Control Information
//
// $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/filters/CssSelectorNodeFilter.java,v $
// $Author: derrickoswald $
// $Date: 2004/07/17 13:45:04 $
// $Revision: 1.4 $
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//

package org.htmlparser.filters;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Tag;
import org.htmlparser.util.NodeList;

37 /**
38  * A NodeFilter that accepts nodes based on whether they match a CSS2 selector.
39  * Refer to <a HREF="http://www.w3.org/TR/REC-CSS2/selector.html">
40  * http://www.w3.org/TR/REC-CSS2/selector.html</a> for syntax.
41  * <p>
42  * Todo: more thorough testing, any relevant pseudo-classes, css3 features
43  */

44 public class CssSelectorNodeFilter implements NodeFilter
45 {
46     private static Pattern JavaDoc tokens =
47         Pattern.compile("("
48             + "/\\*.*?\\*/" // comments
49
+ ") | ("
50             + " \".*?[^\"]\"" // double quoted string
51
+ " | \'.*?[^\']\'" // single quoted string
52
+ " | \"\" | \'\' " // empty quoted string
53
+ ") | ("
54             + " [\\~\\*\\$\\^]? = " // attrib-val relations
55
+ ") | ("
56             + " [a-zA-Z_\\*](?:[a-zA-Z0-9_-]|\\\\.)* " // bare name
57
+ ") | \\s*("
58             + " [+>~\\s] " // combinators
59
+ ")\\s* | ("
60             + " [\\.\\[\\]\\#\\:)(] " // class/id/attr/param delims
61
+ ") | ("
62             + " [\\,] " // comma
63
+ ") | ( . )" // everything else (bogus)
64
,
65             Pattern.CASE_INSENSITIVE
66             |Pattern.DOTALL
67             |Pattern.COMMENTS);
68
69     private static final int COMMENT = 1, QUOTEDSTRING = 2, RELATION = 3,
70         NAME = 4, COMBINATOR = 5, DELIM = 6, COMMA = 7;
71
72     private NodeFilter therule;
73
74     public CssSelectorNodeFilter(String JavaDoc selector)
75     {
76         m = tokens.matcher(selector);
77         if (nextToken())
78             therule = parse();
79     }
80
81     public boolean accept(Node n)
82     {
83         return therule.accept(n);
84     }
85
86     private Matcher JavaDoc m = null;
87     private int tokentype = 0;
88     private String JavaDoc token = null;
89
90     private boolean nextToken()
91     {
92         if (m != null && m.find())
93             for (int i = 1; i < m.groupCount(); i++)
94                 if (m.group(i) != null)
95                 {
96                     tokentype = i;
97                     token = m.group(i);
98                     return true;
99                 }
100         tokentype = 0;
101         token = null;
102         return false;
103     }
104
105     private NodeFilter parse()
106     {
107         NodeFilter n = null;
108         do
109         {
110             switch (tokentype)
111             {
112                 case COMMENT:
113                 case NAME:
114                 case DELIM:
115                     if (n == null)
116                         n = parseSimple();
117                     else
118                         n = new AndFilter(n, parseSimple());
119                     break;
120                 case COMBINATOR:
121                     switch (token.charAt(0))
122                     {
123                         case '+':
124                             n = new AdjacentFilter(n);
125                             break;
126                         case '>':
127                             n = new HasParentFilter(n);
128                             break;
129                         default: // whitespace
130
n = new HasAncestorFilter(n);
131                     }
132                     nextToken();
133                     break;
134                 case COMMA:
135                     n = new OrFilter(n, parse());
136                     nextToken();
137                     break;
138             }
139         }
140         while (token != null);
141         return n;
142     }
143
144     private NodeFilter parseSimple()
145     {
146         boolean done = false;
147         NodeFilter n = null;
148
149         if (token != null)
150             do
151             {
152                 switch (tokentype)
153                 {
154                     case COMMENT:
155                         nextToken();
156                         break;
157                     case NAME:
158                         if ("*".equals(token))
159                             n = new YesFilter();
160                         else if (n == null)
161                             n = new TagNameFilter(unescape(token));
162                         else
163                             n = new AndFilter(n, new TagNameFilter(unescape(token)));
164                         nextToken();
165                         break;
166                     case DELIM:
167                         switch (token.charAt(0))
168                         {
169                             case '.':
170                                 nextToken();
171                                 if (tokentype != NAME)
172                                     throw new IllegalArgumentException JavaDoc("Syntax error at " + token);
173                                 if (n == null)
174                                     n = new HasAttributeFilter("class", unescape(token));
175                                 else
176                                     n
177                                     = new AndFilter(n, new HasAttributeFilter("class", unescape(token)));
178                                 break;
179                             case '#':
180                                 nextToken();
181                                 if (tokentype != NAME)
182                                     throw new IllegalArgumentException JavaDoc("Syntax error at " + token);
183                                 if (n == null)
184                                     n = new HasAttributeFilter("id", unescape(token));
185                                 else
186                                     n = new AndFilter(n, new HasAttributeFilter("id", unescape(token)));
187                                 break;
188                             case ':':
189                                 nextToken();
190                                 if (n == null)
191                                     n = parsePseudoClass();
192                                 else
193                                     n = new AndFilter(n, parsePseudoClass());
194                                 break;
195                             case '[':
196                                 nextToken();
197                                 if (n == null)
198                                     n = parseAttributeExp();
199                                 else
200                                     n = new AndFilter(n, parseAttributeExp());
201                                 break;
202                         }
203                         nextToken();
204                         break;
205                     default:
206                         done = true;
207                 }
208             }
209             while (!done && token != null);
210         return n;
211     }
212
213     private NodeFilter parsePseudoClass()
214     {
215         throw new IllegalArgumentException JavaDoc("pseudoclasses not implemented yet");
216     }
217
218     private NodeFilter parseAttributeExp()
219     {
220         NodeFilter n = null;
221         if (tokentype == NAME)
222         {
223             String JavaDoc attrib = token;
224             nextToken();
225             if ("]".equals(token))
226                 n = new HasAttributeFilter(unescape(attrib));
227             else if (tokentype == RELATION)
228             {
229                 String JavaDoc val = null, rel = token;
230                 nextToken();
231                 if (tokentype == QUOTEDSTRING)
232                     val = unescape(token.substring(1, token.length() - 1));
233                 else if (tokentype == NAME)
234                     val = unescape(token);
235                 if ("~=".equals(rel) && val != null)
236                     n = new AttribMatchFilter(unescape(attrib),
237                                                                         "\\b"
238                                                                         + val.replaceAll("([^a-zA-Z0-9])", "\\\\$1")
239                                                                         + "\\b");
240                 else if ("=".equals(rel) && val != null)
241                     n = new HasAttributeFilter(attrib, val);
242             }
243         }
244         if (n == null)
245             throw new IllegalArgumentException JavaDoc("Syntax error at " + token + tokentype);
246
247         nextToken();
248         return n;
249     }
250
251     public static String JavaDoc unescape(String JavaDoc escaped)
252     {
253         StringBuffer JavaDoc result = new StringBuffer JavaDoc(escaped.length());
254         Matcher JavaDoc m = Pattern.compile("\\\\(?:([a-fA-F0-9]{2,6})|(.))").matcher(
255                         escaped);
256         while (m.find())
257         {
258             if (m.group(1) != null)
259                 m.appendReplacement(result,
260                                                         String.valueOf((char)Integer.parseInt(m.group(1), 16)));
261             else if (m.group(2) != null)
262                 m.appendReplacement(result, m.group(2));
263         }
264         m.appendTail(result);
265
266         return result.toString();
267     }
268
269     private static class HasAncestorFilter implements NodeFilter
270     {
271         private NodeFilter atest;
272
273         public HasAncestorFilter(NodeFilter n)
274         {
275             atest = n;
276         }
277
278         public boolean accept(Node n)
279         {
280             while (n != null)
281             {
282                 n = n.getParent();
283                 if (atest.accept(n))
284                     return true;
285             }
286             return false;
287         }
288     }
289
290     private static class AdjacentFilter implements NodeFilter
291     {
292         private NodeFilter sibtest;
293
294         public AdjacentFilter(NodeFilter n)
295         {
296             sibtest = n;
297         }
298
299         public boolean accept(Node n)
300         {
301             if (n.getParent() != null)
302             {
303                 NodeList l = n.getParent().getChildren();
304                 for (int i = 0; i < l.size(); i++)
305                     if (l.elementAt(i) == n && i > 0)
306                         return (sibtest.accept(l.elementAt(i - 1)));
307             }
308             return false;
309         }
310     }
311
312     private static class YesFilter implements NodeFilter
313     {
314         public boolean accept(Node n)
315         {return true;}
316     }
317
318     private static class AttribMatchFilter implements NodeFilter
319     {
320         private Pattern JavaDoc rel;
321         private String JavaDoc attrib;
322
323         public AttribMatchFilter(String JavaDoc attrib, String JavaDoc regex)
324         {
325             rel = Pattern.compile(regex);
326             this.attrib = attrib;
327         }
328
329         public boolean accept(Node node)
330         {
331             if (node instanceof Tag && ((Tag)node).getAttribute(attrib) != null)
332                 if (rel != null
333                         && !rel.matcher(((Tag)node).getAttribute(attrib)).find())
334                     return false;
335                 else
336                     return true;
337             else
338                 return false;
339         }
340     }
341 }
342
Popular Tags