package org.htmlparser.filters;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Tag;
import org.htmlparser.util.NodeList;

/**
 * A {@link NodeFilter} that accepts nodes matching a CSS-style selector.
 *
 * <p>The selector is tokenised and parsed once, in the constructor, into a
 * tree of simpler filters. The syntax handled by this class is:</p>
 * <ul>
 * <li>type selectors and the universal selector: <code>div</code>, <code>*</code></li>
 * <li>class and id selectors: <code>.name</code>, <code>#name</code></li>
 * <li>attribute selectors: <code>[attr]</code>, <code>[attr=value]</code>,
 *     <code>[attr~=value]</code></li>
 * <li>combinators: whitespace (descendant), <code>&gt;</code> (child),
 *     <code>+</code> (adjacent sibling)</li>
 * <li>selector groups separated by commas</li>
 * </ul>
 * <p>Pseudo-classes are recognised but not implemented; using one raises an
 * {@link IllegalArgumentException}. Selector comments are skipped.</p>
 *
 * <p>NOTE(review): the <code>~</code> combinator is tokenised but handled
 * exactly like the descendant combinator, as in the original code.</p>
 */
public class CssSelectorNodeFilter implements NodeFilter
{
    /**
     * Tokeniser for selectors. Each top-level alternative is one capturing
     * group whose index equals the matching token-type constant below.
     * Compiled with {@link Pattern#COMMENTS}, so literal whitespace inside
     * the expression is ignored.
     */
    private static final Pattern tokens =
        Pattern.compile("("
            + "/\\*.*?\\*/"                              // 1: comment
            + ") | ("
            + "   \".*?[^\"]\""                          // 2: "double quoted"
            + " | \'.*?[^\']\'"                          //    'single quoted'
            + " | \"\" | \'\' "                          //    empty strings
            + ") | ("
            + " [\\~\\*\\$\\^]? = "                      // 3: attribute relation
            + ") | ("
            + " [a-zA-Z_\\*](?:[a-zA-Z0-9_-]|\\\\.)* "   // 4: name (or *)
            + ") | \\s*("
            + " [+>~\\s] "                               // 5: combinator
            + ")\\s* | ("
            + " [\\.\\[\\]\\#\\:)(] "                    // 6: delimiter
            + ") | ("
            + " [\\,] "                                  // 7: comma
            + ") | ( . )",                               // 8: anything else
            Pattern.CASE_INSENSITIVE
            | Pattern.DOTALL
            | Pattern.COMMENTS);

    /** Matches one backslash escape: 2-6 hex digits, or any single character. */
    private static final Pattern ESCAPE =
        Pattern.compile("\\\\(?:([a-fA-F0-9]{2,6})|(.))");

    /** Token types; each value is the index of the capturing group in {@link #tokens}. */
    private static final int COMMENT = 1, QUOTEDSTRING = 2, RELATION = 3,
        NAME = 4, COMBINATOR = 5, DELIM = 6, COMMA = 7;

    /** The compiled selector, or null when the selector contained no tokens. */
    private NodeFilter therule;

    /** Tokeniser state: the matcher over the selector text. */
    private Matcher m = null;

    /** Tokeniser state: type of the current token, 0 when the input is exhausted. */
    private int tokentype = 0;

    /** Tokeniser state: text of the current token, null when the input is exhausted. */
    private String token = null;

    /**
     * Creates a filter for the given selector.
     *
     * @param selector the selector text to compile
     * @throws IllegalArgumentException if the selector is syntactically
     *         invalid or uses an unsupported construct
     */
    public CssSelectorNodeFilter(String selector)
    {
        m = tokens.matcher(selector);
        if (nextToken())
            therule = parse();
    }

    /**
     * Tests a node against the compiled selector.
     *
     * @param n the node to test
     * @return true if the node matches; false if it does not, or if the
     *         selector was empty and therefore matches nothing
     */
    public boolean accept(Node n)
    {
        // An empty (or comment-only) selector compiles to no rule at all.
        return therule != null && therule.accept(n);
    }

    /**
     * Advances to the next significant token, skipping comments.
     *
     * @return true if a token is available in {@code token}/{@code tokentype},
     *         false when the input is exhausted
     */
    private boolean nextToken()
    {
        while (m != null && m.find())
        {
            // Group indices run 1..groupCount() inclusive; the last group is
            // the catch-all that surfaces unexpected characters.
            for (int i = 1; i <= m.groupCount(); i++)
            {
                if (m.group(i) != null)
                {
                    if (i == COMMENT)
                        break; // comments carry no meaning; read the next match
                    tokentype = i;
                    token = m.group(i);
                    return true;
                }
            }
        }
        tokentype = 0;
        token = null;
        return false;
    }

    /**
     * Combines two filters with a logical AND, tolerating a missing operand.
     *
     * @param left the filter built so far, may be null
     * @param right the filter to add, may be null
     * @return the non-null operand if only one is present, otherwise an
     *         {@link AndFilter} over both
     */
    private static NodeFilter and(NodeFilter left, NodeFilter right)
    {
        if (right == null)
            return left;
        if (left == null)
            return right;
        return new AndFilter(left, right);
    }

    /**
     * Parses a full selector (simple selectors joined by combinators and
     * commas) starting at the current token and consuming all remaining input.
     *
     * @return the filter for the parsed selector, or null if no selector
     *         content was found
     * @throws IllegalArgumentException on a token that cannot appear here
     */
    private NodeFilter parse()
    {
        NodeFilter n = null;
        while (token != null)
        {
            switch (tokentype)
            {
                case NAME:
                case DELIM:
                    n = and(n, parseSimple());
                    break;
                case COMBINATOR:
                    n = parseCombinator(n);
                    break;
                case COMMA:
                    if (n == null)
                        throw new IllegalArgumentException("Syntax error at " + token);
                    // Consume the comma BEFORE recursing, otherwise the
                    // recursive call sees the same comma and never terminates.
                    nextToken();
                    NodeFilter rest = parse();
                    if (rest == null)
                        throw new IllegalArgumentException(
                            "Syntax error: selector expected after ','");
                    n = new OrFilter(n, rest);
                    break;
                default:
                    // Quoted strings, relations and unrecognised characters
                    // are not valid here; failing avoids looping forever on
                    // a token that nothing consumes.
                    throw new IllegalArgumentException("Syntax error at " + token);
            }
        }
        return n;
    }

    /**
     * Applies the combinator at the current token to the selector parsed so
     * far and advances past it.
     *
     * @param left the filter for everything to the left of the combinator,
     *        may be null
     * @return the filter that the following simple selector must be ANDed with
     * @throws IllegalArgumentException if an explicit combinator has nothing
     *         on its left-hand side
     */
    private NodeFilter parseCombinator(NodeFilter left)
    {
        char kind = token.charAt(0);
        nextToken();
        // Whitespace only means "descendant" when it sits between two
        // selectors; at the edges, next to a comma, or in front of another
        // combinator it is just formatting.
        if (Character.isWhitespace(kind)
            && (left == null || token == null
                || tokentype == COMMA || tokentype == COMBINATOR))
            return left;
        if (left == null)
            throw new IllegalArgumentException(
                "Syntax error: combinator '" + kind + "' has no left-hand selector");
        switch (kind)
        {
            case '+':
                return new AdjacentFilter(left);
            case '>':
                return new HasParentFilter(left);
            default:
                return new HasAncestorFilter(left);
        }
    }

    /**
     * Parses one simple selector: a run of names, class/id/attribute
     * qualifiers with no combinator between them.
     *
     * @return the filter for the simple selector, or null if only
     *         meaningless delimiters were consumed
     * @throws IllegalArgumentException on malformed qualifiers
     */
    private NodeFilter parseSimple()
    {
        boolean done = false;
        NodeFilter n = null;

        while (!done && token != null)
        {
            switch (tokentype)
            {
                case NAME:
                    if ("*".equals(token))
                        n = and(n, new YesFilter());
                    else
                        n = and(n, new TagNameFilter(unescape(token)));
                    nextToken();
                    break;
                case DELIM:
                    switch (token.charAt(0))
                    {
                        case '.':
                            nextToken();
                            if (tokentype != NAME)
                                throw new IllegalArgumentException("Syntax error at " + token);
                            n = and(n, new HasAttributeFilter("class", unescape(token)));
                            break;
                        case '#':
                            nextToken();
                            if (tokentype != NAME)
                                throw new IllegalArgumentException("Syntax error at " + token);
                            n = and(n, new HasAttributeFilter("id", unescape(token)));
                            break;
                        case ':':
                            nextToken();
                            n = and(n, parsePseudoClass());
                            break;
                        case '[':
                            nextToken();
                            n = and(n, parseAttributeExp());
                            break;
                    }
                    // Every branch above leaves the current token on the last
                    // token it used (name, or closing bracket); step past it.
                    nextToken();
                    break;
                default:
                    done = true;
            }
        }
        return n;
    }

    /**
     * Parses a pseudo-class. Not supported.
     *
     * @return never returns normally
     * @throws IllegalArgumentException always
     */
    private NodeFilter parsePseudoClass()
    {
        throw new IllegalArgumentException("pseudoclasses not implemented yet");
    }

    /**
     * Parses the inside of an attribute selector; the current token is the
     * one following the opening bracket. On success the current token is left
     * on the token that follows the attribute name or value (the closing
     * bracket in well-formed input) so that the caller consumes it.
     *
     * @return the filter for the attribute expression
     * @throws IllegalArgumentException if the expression is malformed or
     *         uses an unsupported relation
     */
    private NodeFilter parseAttributeExp()
    {
        NodeFilter n = null;
        if (tokentype == NAME)
        {
            String attrib = unescape(token);
            nextToken();
            if ("]".equals(token))
                // Already positioned on the closing bracket: do not advance,
                // or the caller would swallow the token after it.
                n = new HasAttributeFilter(attrib);
            else if (tokentype == RELATION)
            {
                String val = null, rel = token;
                nextToken();
                if (tokentype == QUOTEDSTRING)
                    val = unescape(token.substring(1, token.length() - 1));
                else if (tokentype == NAME)
                    val = unescape(token);
                if ("~=".equals(rel) && val != null)
                    n = new AttribMatchFilter(attrib,
                        "\\b"
                        + val.replaceAll("([^a-zA-Z0-9])", "\\\\$1")
                        + "\\b");
                else if ("=".equals(rel) && val != null)
                    n = new HasAttributeFilter(attrib, val);
                if (n != null)
                    nextToken(); // move from the value to the closing bracket
            }
        }
        if (n == null)
            throw new IllegalArgumentException("Syntax error at " + token + tokentype);

        return n;
    }

    /**
     * Replaces backslash escapes in a selector fragment with the characters
     * they denote. A backslash followed by 2-6 hexadecimal digits becomes the
     * character with that code point (U+FFFD if the value is not a valid code
     * point); a backslash followed by any other character becomes that
     * character.
     *
     * @param escaped the text possibly containing escapes
     * @return the text with all escapes resolved
     */
    public static String unescape(String escaped)
    {
        StringBuffer result = new StringBuffer(escaped.length());
        Matcher matcher = ESCAPE.matcher(escaped);
        while (matcher.find())
        {
            String replacement;
            if (matcher.group(1) != null)
            {
                int codePoint = Integer.parseInt(matcher.group(1), 16);
                replacement = Character.isValidCodePoint(codePoint)
                    ? new String(Character.toChars(codePoint))
                    : "\uFFFD";
            }
            else
                replacement = matcher.group(2);
            // The decoded text is literal: quote it so that '$' and '\' are
            // not interpreted as replacement-string syntax.
            matcher.appendReplacement(result, Matcher.quoteReplacement(replacement));
        }
        matcher.appendTail(result);

        return result.toString();
    }

    /** Accepts a node when any of its ancestors is accepted by the wrapped filter. */
    private static class HasAncestorFilter implements NodeFilter
    {
        private NodeFilter atest;

        public HasAncestorFilter(NodeFilter n)
        {
            atest = n;
        }

        public boolean accept(Node n)
        {
            // Walk up the parent chain; stop at the root without ever handing
            // a null node to the wrapped filter.
            for (Node ancestor = (n == null) ? null : n.getParent();
                 ancestor != null;
                 ancestor = ancestor.getParent())
            {
                if (atest.accept(ancestor))
                    return true;
            }
            return false;
        }
    }

    /**
     * Accepts a node when the child immediately preceding it in its parent's
     * child list is accepted by the wrapped filter.
     */
    private static class AdjacentFilter implements NodeFilter
    {
        private NodeFilter sibtest;

        public AdjacentFilter(NodeFilter n)
        {
            sibtest = n;
        }

        public boolean accept(Node n)
        {
            if (n.getParent() != null)
            {
                NodeList l = n.getParent().getChildren();
                for (int i = 0; i < l.size(); i++)
                    if (l.elementAt(i) == n && i > 0)
                        return (sibtest.accept(l.elementAt(i - 1)));
            }
            return false;
        }
    }

    /** Accepts every node; used for the universal selector. */
    private static class YesFilter implements NodeFilter
    {
        public boolean accept(Node n)
        {
            return true;
        }
    }

    /**
     * Accepts tags that carry the given attribute with a value in which the
     * given regular expression can be found.
     */
    private static class AttribMatchFilter implements NodeFilter
    {
        private Pattern rel;
        private String attrib;

        public AttribMatchFilter(String attrib, String regex)
        {
            rel = Pattern.compile(regex);
            this.attrib = attrib;
        }

        public boolean accept(Node node)
        {
            if (!(node instanceof Tag))
                return false;
            String value = ((Tag)node).getAttribute(attrib);
            if (value == null)
                return false;
            return rel == null || rel.matcher(value).find();
        }
    }
}