1 19 20 package org.openharmonise.rm.search; 21 22 import java.io.*; 23 import java.util.*; 24 import java.util.logging.*; 25 26 import javax.xml.transform.*; 27 import javax.xml.transform.dom.*; 28 import javax.xml.transform.stream.*; 29 30 import org.apache.lucene.analysis.*; 31 import org.apache.lucene.analysis.standard.*; 32 import org.apache.lucene.document.*; 33 import org.apache.lucene.document.Document; 34 import org.apache.lucene.index.*; 35 import org.apache.lucene.queryParser.*; 36 import org.apache.lucene.search.*; 37 import org.apache.lucene.store.*; 38 import org.openharmonise.commons.xml.*; 39 import org.openharmonise.rm.*; 40 import org.openharmonise.rm.config.*; 41 import org.openharmonise.rm.resources.*; 42 import org.openharmonise.rm.resources.content.*; 43 import org.pdfbox.pdfparser.*; 44 import org.pdfbox.pdmodel.*; 45 import org.pdfbox.util.*; 46 import EDU.oswego.cs.dl.util.concurrent.*; 47 48 49 57 public class HarmoniseIndexer { 58 59 private static HarmoniseIndexer m_instance = null; 60 private static String m_indexHome = ""; 61 private static Templates m_striptags_xsl = null; 62 private static ArrayList keywordFieldList = new ArrayList(); 63 64 private static final String FIELD_UNIQUEID = "uniqueid"; 65 private static final String FIELD_ID = "id"; 66 private static final String FIELD_NAME = "name"; 67 private static final String FIELD_DISPLAY_NAME = "display_name"; 68 private static final String FIELD_SUMMARY = "summary"; 69 private static final String FIELD_GROUP = "group"; 70 private static final String FIELD_CONTENTS = "contents"; 71 private static final String FIELD_CLASS = "class"; 72 private static final String INDEX_LOC_PROP = "INDEX_LOCATION"; 73 public static final String TAG_INDEXER = "indexer"; 74 public static final String TAG_INDEXABLE = "indexable"; 75 public static final String TAG_TEMPLATE = "template"; 76 public static final String TAG_COMPARISON = "comparison"; 77 public static final String TAG_INDEX = "index"; 78 public static final String ATTRIB_CLASSNAME = "classname"; 79 private static final String PNAME_STRIPTAGS_XSL = "STRIPTAGS"; 80 81 private Executor executor; 82 83 86 private static Logger m_logger = Logger.getLogger(HarmoniseIndexer.class.getName()); 87 88 static { 89 keywordFieldList.add(FIELD_UNIQUEID); 91 keywordFieldList.add(FIELD_ID); 92 keywordFieldList.add(FIELD_GROUP); 93 } 94 95 100 private HarmoniseIndexer() throws HarmoniseIndexerException { 101 try { 102 m_indexHome = ConfigSettings.getProperty(INDEX_LOC_PROP); 103 if ((m_indexHome == null) || (m_indexHome.length() == 0)) { 104 throw new HarmoniseIndexerException("Index location is not defined!!"); 105 } 106 executor = new QueuedExecutor(); 107 } 108 catch (Exception e) { 109 m_logger.log(Level.SEVERE, "Could not instantiate HarmoniseIndexer", e); 110 throw new HarmoniseIndexerException(e.getMessage(), e); 111 } 112 } 113 114 120 public static HarmoniseIndexer getInstance() throws HarmoniseIndexerException { 121 if (m_instance == null) { 122 m_instance = new HarmoniseIndexer(); 123 } 124 return m_instance; 125 } 126 127 public static HarmoniseIndexer getIndexer(String indexHome) throws HarmoniseIndexerException { 128 if (m_instance == null) { 129 m_instance = new HarmoniseIndexer(); 130 } 131 HarmoniseIndexer.m_indexHome = indexHome; 132 return m_instance; 133 } 134 135 142 public static boolean isIndexed(AbstractObject xobj) throws HarmoniseIndexerException { 143 boolean bExists = false; 144 145 try { 146 Directory directory = FSDirectory.getDirectory(HarmoniseIndexer.m_indexHome, false); 147 IndexReader reader = IndexReader.open(directory); 148 Term term = new Term(HarmoniseIndexer.FIELD_UNIQUEID, xobj.getClass().getName() + String.valueOf(xobj.getId())); 149 150 if (reader.docFreq(term) > 0) { 151 bExists = true; 152 } 153 reader.close(); 154 } 155 catch (FileNotFoundException e) { 156 bExists = false; 157 } 158 catch (Exception e) { 159 m_logger.log(Level.WARNING, e.getLocalizedMessage(), e); 160 throw new HarmoniseIndexerException(e.getMessage(), e); 161 } 162 163 return bExists; 164 } 165 166 172 public void indexObject(AbstractObject pObj) throws HarmoniseIndexerException { 173 if (pObj == null || (pObj instanceof AbstractObject) == false) { 174 throw new HarmoniseIndexerException("Object must be AbstractObject - " + pObj.getClass().getName()); 175 } 176 IndexRunnable indexer = new IndexRunnable(pObj); 177 try { 178 executor.execute(indexer); } 180 catch (InterruptedException e) { 181 throw new HarmoniseIndexerException("Problem running indexer asynchronously", e); 182 } 183 } 184 185 186 199 public List searchContents(Class xobjClass, Vector groupIds, String sName, String sSummary, String sContent) throws HarmoniseIndexerException { 200 return searchContents(getQuery(xobjClass, groupIds, sName, null, sSummary, sContent)); 201 } 202 203 215 public String getQuery(Class xobjClass, Vector groupIds, String sName, String sDisplayName, String sSummary, String sContent) { 216 StringBuffer sQuery = new StringBuffer (); 217 218 sQuery.append(HarmoniseIndexer.FIELD_CLASS).append(":").append(xobjClass.getName()); 219 220 if ((groupIds != null) && (groupIds.size() > 0)) { 221 sQuery.append(" AND ("); 222 223 for (int i = 0; i < groupIds.size(); i++) { 224 if (i > 0) { 225 sQuery.append(" OR "); 226 } 227 228 sQuery.append(HarmoniseIndexer.FIELD_GROUP).append(":").append(groupIds.elementAt(i)); 229 } 230 231 sQuery.append(") "); 232 } 233 234 sQuery.append(" AND ("); 235 236 boolean bOR = false; 237 238 if ((sName != null) && (sName.length() > 0)) { 240 buildFieldQueryString(sQuery, FIELD_NAME, sName); 241 bOR = true; 242 } 243 244 if(sDisplayName != null) { 245 if (bOR) { 246 sQuery.append(" OR "); 247 } 248 buildFieldQueryString(sQuery, FIELD_DISPLAY_NAME, sDisplayName); 249 250 bOR = true; 251 } 252 253 if ((sSummary != null) && (sSummary.length() > 0)) { 255 if (bOR) { 256 sQuery.append(" OR "); 257 } 258 259 buildFieldQueryString(sQuery, FIELD_SUMMARY, sSummary); 260 bOR = true; 261 } 262 263 if ((sContent != null) && (sContent.length() > 0)) { 265 if (bOR) { 266 sQuery.append(" OR "); 267 } 268 269 buildFieldQueryString(sQuery, FIELD_CONTENTS, sContent); 270 } 271 272 sQuery.append(")"); 273 274 return sQuery.toString(); 275 } 276 277 285 public List searchContents(String queryString) throws HarmoniseIndexerException { 286 Vector vec = new Vector(); 287 288 if(m_logger.getLevel() == Level.FINE) { 289 m_logger.log(Level.FINE, "Lucene query - " + queryString); 290 } 291 292 try { 293 Searcher searcher = new IndexSearcher(m_indexHome); 294 StandardAnalyzer standardAnalyzer = new StandardAnalyzer(); 295 296 PerFieldAnalyzerWrapper analyzer = new PerFieldAnalyzerWrapper(new HarmoniseAnalyzer()); 299 300 Iterator iter = keywordFieldList.iterator(); 301 302 while(iter.hasNext()) { 303 String field = (String ) iter.next(); 304 analyzer.addAnalyzer(field,standardAnalyzer); 305 } 306 307 Query query = QueryParser.parse(queryString, FIELD_CONTENTS,analyzer); 308 309 Hits hits = searcher.search(query); 310 if(m_logger.getLevel() == Level.FINE) { 311 m_logger.log(Level.FINE, "Lucene query found " + hits.length() + " hits in the index"); 312 } 313 314 for (int i = 0; i < hits.length(); i++) { 315 vec.addElement(hits.doc(i).get(FIELD_ID)); 316 } 317 318 searcher.close(); 319 } 320 catch (Exception e) { 321 m_logger.log(Level.WARNING, e.getLocalizedMessage(), e); 322 } 323 return vec; 326 } 327 328 335 public List search(String queryString) throws HarmoniseIndexerException { 336 List hitsList = new ArrayList(); 337 Hits hits = null; 338 try { 339 Searcher searcher = new IndexSearcher(m_indexHome); 340 Query query = QueryParser.parse(queryString, FIELD_CONTENTS, new HarmoniseAnalyzer()); 341 hits = searcher.search(query); 342 344 for (int i = 0; i < hits.length(); i++) { 345 StringBuffer sb = new StringBuffer (); 346 org.apache.lucene.document.Document doc = hits.doc(i); 347 sb.append("Unique id: " + doc.get(FIELD_UNIQUEID)); 348 sb.append("\n"); 349 sb.append("Summary: " + doc.get(FIELD_SUMMARY)); 350 sb.append("\n"); 351 sb.append("Name: " + doc.get(FIELD_NAME)); 352 sb.append("\n"); 353 sb.append("Class: " + doc.get(FIELD_CLASS)); 354 hitsList.add(sb.toString()); 355 } 356 searcher.close(); 357 } 358 catch (Exception e) { 359 throw new HarmoniseIndexerException(e.getMessage(), e); 360 } 361 return hitsList; 362 } 363 364 370 public void deleteFromIndex(AbstractObject xobj) throws HarmoniseIndexerException { 371 DeleterRunnable deleter = new DeleterRunnable(xobj); 372 try { 373 executor.execute(deleter); 374 } 375 catch (InterruptedException e) { 376 throw new HarmoniseIndexerException("Problem running delete asynchronously", e); 377 } 378 } 379 380 381 389 private void buildFieldQueryString(StringBuffer queryBuf, String fieldName, String inputString) { 390 StringTokenizer tokeniser = new StringTokenizer(inputString); 392 393 boolean buildingPhrase = false; 397 while (tokeniser.hasMoreTokens()) { 398 String token = tokeniser.nextToken(); 399 token = token.replaceAll("\"\"", "\""); 400 if (token.equals("AND") || token.equals("OR") || token.equals("NOT")) { 401 queryBuf.append(token + " "); } 403 else { 404 if (buildingPhrase == true) { 406 queryBuf.append(token + " "); if (token.endsWith("\"")) { 409 buildingPhrase = false; 410 } 411 continue; 412 } 413 queryBuf.append(fieldName + ":" + token + " "); 414 if (token.startsWith("\"")) { 416 buildingPhrase = true; 417 } 418 } 419 } 420 } 421 422 428 private class IndexRunnable implements Runnable { 429 430 private AbstractObject obj; 431 private String contents; 432 433 public IndexRunnable(AbstractObject obj) { 434 this.obj = obj; 435 } 436 437 440 public void run() { 441 contents = getContents(); 442 Document doc = new Document(); 444 String classname = obj.getClass().getName(); 445 446 doc.add(Field.Keyword(FIELD_UNIQUEID, classname + String.valueOf(obj.getId()))); 447 doc.add(Field.UnIndexed(FIELD_ID, String.valueOf(obj.getId()))); 448 449 try { 450 AbstractParentObject grp = ((AbstractChildObject) obj).getRealParent(); 451 if (grp != null) { 452 doc.add(Field.Keyword(FIELD_GROUP, String.valueOf(grp.getId()))); 453 } 454 455 doc.add(Field.Text(FIELD_CLASS, classname)); 456 doc.add(Field.Text(FIELD_NAME, obj.getName())); 457 if (obj.getSummary() != null) { 458 doc.add(Field.Text(FIELD_SUMMARY, obj.getSummary())); 459 } 460 461 if(obj instanceof AbstractEditableObject) { 462 AbstractEditableObject edObj = (AbstractEditableObject) obj; 463 String sDispName = edObj.getDisplayName(); 464 465 if (sDispName != null) { 466 doc.add(Field.Text(FIELD_DISPLAY_NAME, sDispName)); 467 } 468 } 469 470 if (contents != null) { 471 doc.add(Field.Text(FIELD_CONTENTS, new StringReader(contents))); 472 } 473 474 475 if (HarmoniseIndexer.isIndexed(obj) == true) { 476 Directory directory = FSDirectory.getDirectory(HarmoniseIndexer.m_indexHome, false); 477 478 if (IndexReader.indexExists(directory)) { 479 IndexReader reader = IndexReader.open(directory); 480 Term term = new Term(HarmoniseIndexer.FIELD_UNIQUEID, obj.getClass().getName() + String.valueOf(obj.getId())); 481 reader.delete(term); 482 reader.close(); 483 } 484 } 485 IndexWriter writer = null; 486 try { 487 writer = new IndexWriter(HarmoniseIndexer.m_indexHome, new HarmoniseAnalyzer(), false); 488 } catch (FileNotFoundException e) { 489 writer = new IndexWriter(HarmoniseIndexer.m_indexHome, new HarmoniseAnalyzer(), true); 490 } 491 492 writer.addDocument(doc); 493 writer.optimize(); 494 writer.close(); 495 HarmoniseIndexer.m_logger.log(Level.INFO, "indexed " + obj.getType() + ", ID: " + obj.getId()); 496 } 497 catch (DataAccessException e) { 498 HarmoniseIndexer.m_logger.log(Level.WARNING, "Data Access Exception", e); 499 } 500 catch (IOException e) { 501 HarmoniseIndexer.m_logger.log(Level.WARNING, "IOException", e); 502 } 503 catch (HarmoniseIndexerException e) { 504 HarmoniseIndexer.m_logger.log(Level.WARNING, "Harmonise Indexer Exception", e); 505 } 506 } 507 508 private String getContents() { 509 510 String objContents = null; 511 512 try { 513 if (obj instanceof org.openharmonise.rm.resources.content.Document) { 514 org.openharmonise.rm.resources.content.Document doc = (org.openharmonise.rm.resources.content.Document) obj; 515 org.w3c.dom.Document xmlcontent = XMLDocument.getXMLDocumentFromString(doc.getContent()); 516 objContents = getStringFromXML(xmlcontent); 517 } 518 else if (obj instanceof Asset) { 519 Asset asset = (Asset) obj; 520 if (asset.getContentType().equalsIgnoreCase("application/pdf")) { 521 objContents = getStringFromPDF(asset.getContentFile()); 522 } 523 } 524 } 525 catch (Exception e) { 526 HarmoniseIndexer.m_logger.log(Level.WARNING, "Exception", e); 527 } 528 return objContents; 529 } 530 531 538 private String getStringFromXML(org.w3c.dom.Document xml) throws HarmoniseIndexerException { 539 540 String sResult = ""; 541 try { 542 if(HarmoniseIndexer.m_striptags_xsl == null) { 543 String stripFileName = ConfigSettings.getProperty(PNAME_STRIPTAGS_XSL); 545 546 if(stripFileName != null && stripFileName.length()>0) { 547 StreamSource ssource = new StreamSource(new File(stripFileName)); 548 HarmoniseIndexer.m_striptags_xsl = (Templates) org.apache.xalan.xsltc.trax.TransformerFactoryImpl.newInstance().newTemplates(ssource); 549 } 550 } 551 if(m_striptags_xsl != null) { 553 Transformer trans = HarmoniseIndexer.m_striptags_xsl.newTransformer(); 554 DOMSource ds = new DOMSource(xml.getDocumentElement()); 555 StringWriter sw = new StringWriter(); 556 StreamResult res = new StreamResult(sw); 557 trans.transform(ds, res); 558 sResult = sw.toString(); 559 sw.close(); 560 } 561 } catch (ConfigException e) { 562 throw new HarmoniseIndexerException("Config error", e); 563 } catch (TransformerConfigurationException e) { 564 throw new HarmoniseIndexerException("Transformer Configuration Exception", e); 565 } catch (TransformerFactoryConfigurationError e) { 566 throw new HarmoniseIndexerException("Transformer Factory Configuration error", e); 567 } catch (TransformerException e) { 568 throw new HarmoniseIndexerException("Transformer error", e); 569 } catch (IOException e) { 570 throw new HarmoniseIndexerException("IO error", e); 571 } 572 573 return sResult; 574 } 575 576 583 private String getStringFromPDF(File pdfFile) throws HarmoniseIndexerException { 584 String sText = ""; 585 586 try { 587 FileInputStream pdfStream = new FileInputStream(pdfFile); 588 PDFParser pdfParser = new PDFParser(pdfStream); 589 pdfParser.parse(); 590 PDDocument pdf = pdfParser.getPDDocument(); 591 PDFTextStripper textstripper = new PDFTextStripper(); 592 sText = textstripper.getText(pdf); 593 HarmoniseIndexer.m_logger.log(Level.FINEST, sText); 594 pdf.close(); 595 } catch (FileNotFoundException e) { 596 throw new HarmoniseIndexerException("File not found", e); 597 } catch (IOException e) { 598 throw new HarmoniseIndexerException("IO exception", e); 599 } 600 601 return sText; 602 } 603 } 604 605 610 private class DeleterRunnable implements Runnable { 611 612 private AbstractObject obj; 613 614 617 public DeleterRunnable(AbstractObject obj) { 618 if (obj == null) { 619 throw new NullPointerException ("obj cannot be null"); 620 } 621 this.obj = obj; 622 } 623 624 627 public void run() { 628 try { 629 Directory directory = FSDirectory.getDirectory(HarmoniseIndexer.m_indexHome, false); 630 631 if (IndexReader.indexExists(directory)) { 632 IndexReader reader = IndexReader.open(directory); 633 Term term = new Term(HarmoniseIndexer.FIELD_UNIQUEID, obj.getClass().getName() + String.valueOf(obj.getId())); 634 reader.delete(term); 635 reader.close(); 636 HarmoniseIndexer.m_logger.log(Level.FINE, "deleted " + obj.getType() + ", ID: " + obj.getId() + " from index"); 637 } 638 } 639 catch (Exception e) { 640 HarmoniseIndexer.m_logger.log(Level.WARNING, "problem deleting object", e); 641 } 642 } 643 } 644 645 } | Popular Tags |