1 17 18 19 20 package org.apache.lenya.search.crawler; 21 22 import java.io.BufferedReader ; 23 import java.io.InputStream ; 24 import java.io.InputStreamReader ; 25 import java.text.ParseException ; 26 import java.text.SimpleDateFormat ; 27 import java.util.ArrayList ; 28 import java.util.List ; 29 30 import javax.swing.text.MutableAttributeSet ; 31 import javax.swing.text.html.HTML ; 32 import javax.swing.text.html.HTML.Tag; 33 import javax.swing.text.html.HTMLEditorKit.ParserCallback; 34 import javax.swing.text.html.parser.ParserDelegator ; 35 36 37 40 public final class HTMLHandler extends ParserCallback implements ContentHandler { 41 private static final char space = ' '; 42 private static final char NONE = 0; 43 private static final char TITLE = 1; 44 private static final char HREF = 2; 45 private static final char SCRIPT = 3; 46 private static ParserDelegator pd = new ParserDelegator (); 47 48 private String title; 50 private String description; 51 private String keywords; 52 private String categories; 53 private long published; 54 private String href; 55 private String author; 56 private StringBuffer contents; 57 private ArrayList links; 58 59 private boolean robotIndex; 61 private boolean robotFollow; 62 private char state; 63 private SimpleDateFormat dateFormatter; 64 65 68 public HTMLHandler() { 69 contents = new StringBuffer (); 70 71 links = new ArrayList (); 72 73 published = -1; 74 75 dateFormatter = new SimpleDateFormat ("yyyy.MM.dd HH:mm:ss z"); 77 } 78 79 84 public String getAuthor() { 85 return author; 86 } 87 88 93 public String getCategories() { 94 return this.categories; 95 } 96 97 102 public String getContents() { 103 return this.contents.toString(); 104 } 105 106 111 public String getDescription() { 112 return this.description; 113 } 114 115 120 public String getHREF() { 121 return this.href; 122 } 123 124 129 public String getKeywords() { 130 return this.keywords; 131 } 132 133 138 public List getLinks() { 139 return links; 140 } 141 142 147 public long getPublished() { 148 return this.published; 149 } 150 151 156 public boolean getRobotFollow() { 157 return this.robotFollow; 158 } 159 160 165 public boolean getRobotIndex() { 166 return this.robotIndex; 167 } 168 169 174 public String getTitle() { 175 return this.title; 176 } 177 178 183 public void handleAnchor(MutableAttributeSet attribs) { 184 String href = new String (); 185 186 href = (String ) attribs.getAttribute(HTML.Attribute.HREF); 187 188 if (href == null) { 189 return; 190 } 191 192 links.add(href); 193 194 state = HREF; 195 } 196 197 203 public void handleEndTag(Tag tag, int pos) { 204 if (state == NONE) { 205 return; 206 } 207 208 if ((state == TITLE) && tag.equals(HTML.Tag.TITLE)) { 210 state = NONE; 211 212 return; 213 } 214 215 if ((state == HREF) && tag.equals(HTML.Tag.A)) { 216 state = NONE; 218 219 return; 220 } 221 222 if ((state == SCRIPT) && tag.equals(HTML.Tag.SCRIPT)) { 223 state = NONE; 224 225 return; 226 } 227 } 228 229 234 public void handleMeta(MutableAttributeSet attribs) { 235 String name = new String (); 236 237 String content = new String (); 238 239 name = (String ) attribs.getAttribute(HTML.Attribute.NAME); 240 241 content = (String ) attribs.getAttribute(HTML.Attribute.CONTENT); 242 243 if ((name == null) || (content == null)) { 244 return; 245 } 246 247 name = name.toUpperCase(); 248 249 if (name.equals("DESCRIPTION")) { 250 description = content; 251 252 return; 253 } 254 255 if (name.equals("KEYWORDS")) { 256 keywords = content; 257 258 return; 259 } 260 261 if (name.equals("CATEGORIES")) { 262 categories = content; 263 264 return; 265 } 266 267 if (name.equals("PUBLISHED")) { 268 try { 269 published = dateFormatter.parse(content).getTime(); 270 } catch (ParseException e) { 271 e.printStackTrace(); 272 } 273 274 return; 275 } 276 277 if (name.equals("HREF")) { 278 href = content; 279 280 return; 281 } 282 283 if (name.equals("AUTHOR")) { 284 author = content; 285 286 return; 287 } 288 289 if (name.equals("ROBOTS")) { 290 if (content.indexOf("noindex") != -1) { 291 robotIndex = false; 292 } 293 294 if (content.indexOf("nofollow") != -1) { 295 robotFollow = false; 296 } 297 298 author = content; 299 300 return; 301 } 302 } 303 304 311 public void handleSimpleTag(Tag tag, MutableAttributeSet attribs, int pos) { 312 if (tag.equals(HTML.Tag.META)) { 313 handleMeta(attribs); 314 } 315 } 316 317 324 public void handleStartTag(Tag tag, MutableAttributeSet attribs, int pos) { 325 if (tag.equals(HTML.Tag.TITLE)) { 326 state = TITLE; 327 } else if (tag.equals(HTML.Tag.A)) { 328 handleAnchor(attribs); 329 } else if (tag.equals(HTML.Tag.SCRIPT)) { 330 state = SCRIPT; 331 } 332 } 333 334 340 public void handleText(char[] text, int pos) { 341 switch (state) { 342 case NONE: 343 contents.append(text); 344 contents.append(space); 345 346 break; 347 348 case TITLE: 349 title = new String (text); 350 351 break; 352 353 case HREF: 354 contents.append(text); 355 contents.append(space); 356 357 break; 359 } 360 } 361 362 367 public void parse(InputStream in) { 368 try { 369 reset(); 370 371 pd.parse(new BufferedReader (new InputStreamReader (in)), this, true); 372 } catch (Exception e) { 373 e.printStackTrace(); 374 } 375 } 376 377 380 private void reset() { 381 title = null; 382 383 description = null; 384 385 keywords = null; 386 387 categories = null; 388 389 href = null; 390 391 author = null; 392 393 contents.setLength(0); 394 395 links = new ArrayList (); 396 397 published = -1; 398 399 robotIndex = true; 401 402 robotFollow = true; 403 404 state = NONE; 405 } 406 } 407 | Popular Tags |