1 17 18 19 20 package org.apache.lenya.lucene.parser; 21 22 import java.io.Reader ; 23 import java.io.StringReader ; 24 import java.util.ArrayList ; 25 import java.util.Iterator ; 26 import java.util.List ; 27 28 import javax.swing.text.MutableAttributeSet ; 29 import javax.swing.text.html.HTML ; 30 import javax.swing.text.html.HTML.Tag; 31 import javax.swing.text.html.HTMLEditorKit.ParserCallback; 32 33 import org.apache.log4j.Category; 34 35 public class SwingHTMLHandler extends ParserCallback { 36 Category log = Category.getInstance(SwingHTMLHandler.class); 37 38 41 public SwingHTMLHandler() { 42 debug("\n\n\n\n\nCreating " + getClass().getName()); 43 44 startIndexing(); 46 } 47 48 private TagStack tagStack = new TagStack(); 49 50 protected TagStack getStack() { 51 return tagStack; 52 } 53 54 private StringBuffer titleBuffer = new StringBuffer (); 55 private StringBuffer keywordsBuffer = new StringBuffer (); 56 57 60 protected void appendToTitle(char[] data) { 61 titleBuffer.append(data); 62 } 63 64 69 public String getTitle() { 70 debug("\n\nTitle: " + titleBuffer.toString()); 71 72 return titleBuffer.toString(); 73 } 74 75 80 public String getKeywords() { 81 log.debug("Keywords: " + keywordsBuffer.toString()); 82 83 return keywordsBuffer.toString(); 84 } 85 86 private StringBuffer contentsBuffer = new StringBuffer (); 87 88 protected void appendToContents(char[] data) { 89 contentsBuffer.append(data); 90 } 91 92 97 public Reader getReader() { 98 debug("\nContents: " + contentsBuffer.toString()); 99 100 return new StringReader (contentsBuffer.toString()); 101 } 102 103 private boolean indexing; 104 105 protected boolean isIndexing() { 106 return indexing; 107 } 108 109 protected void startIndexing() { 110 indexing = true; 111 } 112 113 protected void stopIndexing() { 114 indexing = false; 115 } 116 117 121 124 public void handleStartTag(Tag tag, MutableAttributeSet attributes, int pos) { 125 getStack().push(tag); 126 127 if (!contentsBuffer.toString().endsWith(" ")) { 129 contentsBuffer.append(" "); 130 } 131 132 if (tag.equals(HTML.Tag.META)) { 133 handleMetaTag(attributes); 134 } 135 136 if (tag.equals(HTML.Tag.TITLE)) { 137 handleTitleStartTag(); 138 } 139 140 if (isTagInitialized() && tag.equals(getLuceneTag())) { 141 handleLuceneStartTag(tag, attributes); 142 } 143 } 144 145 148 public void handleEndTag(Tag tag, int pos) { 149 if (!contentsBuffer.toString().endsWith(" ")) { 151 contentsBuffer.append(" "); 152 } 153 154 if (isTagInitialized() && tag.equals(getLuceneTag())) { 155 handleLuceneEndTag(); 156 } 157 158 if (tag.equals(HTML.Tag.TITLE)) { 159 handleTitleEndTag(); 160 } 161 162 try { 163 getStack().pop(); 164 } catch (TagStack.UnderflowException e) { 165 log(e); 166 } 167 } 168 169 private boolean titleParsing; 173 174 protected boolean isTitleParsing() { 175 return titleParsing; 176 } 177 178 protected void startTitleParsing() { 179 titleParsing = true; 180 } 181 182 protected void stopTitleParsing() { 183 titleParsing = false; 184 } 185 186 protected void handleTitleStartTag() { 187 startTitleParsing(); 188 } 189 190 protected void handleTitleEndTag() { 191 stopTitleParsing(); 192 } 193 194 public static final String LUCENE_TAG_NAME = "lucene-tag-name"; 198 public static final String LUCENE_CLASS_VALUE = "lucene-class-value"; 199 private HTML.Tag luceneTag = null; 200 201 204 protected void setLuceneTag(HTML.Tag tag) { 205 debug("Lucene tag: " + tag); 206 luceneTag = tag; 207 } 208 209 212 protected HTML.Tag getLuceneTag() { 213 return luceneTag; 214 } 215 216 private String luceneClassValue = null; 217 218 221 protected void setLuceneClassValue(String value) { 222 debug("Lucene class value: " + value); 223 luceneClassValue = value; 224 } 225 226 229 protected String getLuceneClassValue() { 230 return luceneClassValue; 231 } 232 233 236 protected boolean isTagInitialized() { 237 return (getLuceneTag() != null) && (getLuceneClassValue() != null); 238 } 239 240 243 protected void handleMetaTag(MutableAttributeSet attributes) { 244 Object nameObject = attributes.getAttribute(HTML.Attribute.NAME); 245 Object valueObject = attributes.getAttribute(HTML.Attribute.VALUE); 246 247 if ((nameObject != null) && (valueObject != null)) { 248 String name = (String ) nameObject; 249 log.debug("Meta tag found: name = " + name); 250 251 if (name.equals(LUCENE_TAG_NAME)) { 252 String tagName = (String ) valueObject; 253 HTML.Tag tag = HTML.getTag(tagName.toLowerCase()); 254 setLuceneTag(tag); 255 } 256 257 if (name.equals(LUCENE_CLASS_VALUE)) { 258 setLuceneClassValue((String ) valueObject); 259 } 260 } 261 262 Object contentObject = attributes.getAttribute(HTML.Attribute.CONTENT); 263 if ((nameObject != null) && (contentObject != null)) { 264 String name = (String ) nameObject; 265 log.debug("Meta tag found: name = " + name); 266 if (name.equals("keywords")) { 267 log.debug("Keywords found ..."); 268 keywordsBuffer = new StringBuffer ((String ) contentObject); 269 } 270 } 271 272 if (isTagInitialized()) { 274 stopIndexing(); 275 } 276 } 277 278 private TagStack luceneStack = new TagStack(); 282 283 protected TagStack getLuceneStack() { 284 return luceneStack; 285 } 286 287 290 protected void handleLuceneStartTag(HTML.Tag tag, MutableAttributeSet attributes) { 291 Object valueObject = attributes.getAttribute(HTML.Attribute.CLASS); 292 293 if (valueObject != null) { 294 String value = (String ) valueObject; 295 296 if (value.equals(getLuceneClassValue())) { 297 getLuceneStack().push(tag); 298 debug(""); 299 debug("---------- Starting indexing ----------"); 300 startIndexing(); 301 } 302 } 303 } 304 305 308 protected void handleLuceneEndTag() { 309 try { 310 HTML.Tag stackTag = getStack().top(); 311 312 if (!getLuceneStack().isEmpty()) { 313 HTML.Tag luceneTag = getLuceneStack().top(); 314 315 if (stackTag == luceneTag) { 316 debug(""); 317 debug("---------- Stopping indexing ----------"); 318 getLuceneStack().pop(); 319 stopIndexing(); 320 } 321 } 322 } catch (TagStack.UnderflowException e) { 323 log("Lucene index control tag not closed!", e); 324 } 325 } 326 327 330 public void handleSimpleTag(Tag tag, MutableAttributeSet attributes, int pos) { 331 handleStartTag(tag, attributes, pos); 332 handleEndTag(tag, pos); 333 } 334 335 public void handleText(char[] data, int pos) { 339 if (isDebug) { 342 System.out.println(".handleText(): data: " + new String (data)); 343 } 344 345 350 if (isIndexing() || isTitleParsing()) { 351 appendToContents(data); 352 } 353 354 if (isTitleParsing()) { 355 appendToTitle(data); 356 } 357 } 358 359 private boolean isDebug = false; 363 364 367 protected void debug(String message) { 368 if (isDebug) { 369 System.out.println(message); 370 } 371 } 372 373 376 protected void log(Exception e) { 377 log("", e); 378 } 379 380 383 protected void log(String message, Exception e) { 384 System.out.print(getClass().getName() + ": " + message + " "); 385 e.printStackTrace(System.out); 386 } 387 388 391 public class TagStack { 392 private List tags = new ArrayList (); 393 394 399 public void push(HTML.Tag tag) { 400 tags.add(0, tag); 401 } 402 403 410 public HTML.Tag pop() throws UnderflowException { 411 HTML.Tag tag = top(); 412 tags.remove(tag); 413 414 return tag; 415 } 416 417 424 public HTML.Tag top() throws UnderflowException { 425 HTML.Tag tag = null; 426 427 if (!tags.isEmpty()) { 428 tag = (HTML.Tag ) tags.get(0); 429 } else { 430 throw new UnderflowException(); 431 } 432 433 return tag; 434 } 435 436 441 public boolean isEmpty() { 442 return tags.isEmpty(); 443 } 444 445 448 public void dump() { 449 System.out.print("stack: "); 450 451 for (Iterator i = tags.iterator(); i.hasNext();) { 452 System.out.print(i.next() + ", "); 453 } 454 455 System.out.println(""); 456 } 457 458 461 public class UnderflowException extends Exception { 462 465 public UnderflowException() { 466 super("Stack underflow"); 467 } 468 } 469 } 470 } 471 | Popular Tags |