|                                                                                                              1
 17
 18
 19
 20  package org.apache.lenya.lucene.index;
 21
 22  import java.io.File
  ; 23  import java.io.FileWriter
  ; 24  import java.io.IOException
  ; 25  import java.io.Reader
  ; 26  import java.io.StringReader
  ; 27  import java.io.StringWriter
  ; 28  import java.io.Writer
  ; 29  import java.lang.reflect.Method
  ; 30
 31  import javax.xml.parsers.DocumentBuilder
  ; 32  import javax.xml.parsers.DocumentBuilderFactory
  ; 33  import javax.xml.transform.OutputKeys
  ; 34  import javax.xml.transform.Transformer
  ; 35  import javax.xml.transform.TransformerFactory
  ; 36  import javax.xml.transform.dom.DOMSource
  ; 37  import javax.xml.transform.stream.StreamResult
  ; 38  import javax.xml.transform.stream.StreamSource
  ; 39
 40  import org.apache.lenya.lucene.parser.HTMLParser;
 41  import org.apache.lenya.lucene.parser.HTMLParserFactory;
 42  import org.apache.lenya.lucene.parser.StringCleaner;
 43  import org.apache.lenya.xml.DocumentHelper;
 44  import org.apache.lenya.xml.NamespaceHelper;
 45  import org.apache.log4j.Category;
 46  import org.apache.lucene.document.Document;
 47  import org.apache.lucene.document.Field;
 48  import org.w3c.dom.Element
  ; 49  import org.w3c.dom.Node
  ; 50  import org.w3c.dom.NodeList
  ; 51  import org.xml.sax.InputSource
  ; 52
 53
 56  public class ConfigurableDocumentCreator extends AbstractDocumentCreator {
 57      Category log = Category.getInstance(ConfigurableDocumentCreator.class);
 58
 59      public static final String
  LUCENE_NAMESPACE = "http://apache.org/cocoon/lenya/lucene/1.0"; 60      public static final String
  XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml"; 61
 62
 67      public ConfigurableDocumentCreator(String
  stylesheet) { 68          this.stylesheet = stylesheet;
 69      }
 70
 71      private String
  stylesheet; 72
 73
 78      public String
  getStylesheet() { 79          return stylesheet;
 80      }
 81
 82
 92      public Document getDocument(File
  file, File  htdocsDumpDir) throws Exception  { 93          log.debug(".getDocument() : indexing " + file.getAbsolutePath());
 94          try {
 95
 96              org.w3c.dom.Document
  sourceDocument = null; 97              DocumentBuilderFactory
  parserFactory = DocumentBuilderFactory.newInstance(); 98              parserFactory.setValidating(false);
 99              parserFactory.setNamespaceAware(true);
 100             parserFactory.setIgnoringElementContentWhitespace(true);
 101             DocumentBuilder
  mybuilder = parserFactory.newDocumentBuilder(); 102             sourceDocument = mybuilder.parse(file.getAbsolutePath());
 103
 104
 105
 116
 117
 118
 119
 120             DOMSource
  documentSource = new DOMSource  (sourceDocument); 121             Writer
  documentWriter = new StringWriter  (); 122
 123             TransformerFactory
  tFactory = TransformerFactory.newInstance(); 124             Transformer
  documentTransformer = tFactory.newTransformer(new StreamSource  (new StringReader  (getStylesheet()))); 125             documentTransformer.setOutputProperty(OutputKeys.INDENT, "yes");
 126             documentTransformer.setOutputProperty(OutputKeys.ENCODING, "ISO-8859-1");
 127
 128             String
  fileName = file.getName(); 129
 130             if (fileName.endsWith(".pdf.txt")) {
 131                 fileName = fileName.substring(0, fileName.lastIndexOf(".txt"));
 132             }
 133
 134             documentTransformer.setParameter("filename", fileName);
 135             documentTransformer.transform(documentSource, new StreamResult
  (documentWriter)); 136
 137
 140             DocumentBuilder
  builder = DocumentHelper.createBuilder(); 141             org.w3c.dom.Document
  luceneDocument = builder.parse(new InputSource  (new StringReader  (documentWriter.toString()))); 142
 143             NamespaceHelper helper = new NamespaceHelper(LUCENE_NAMESPACE, "luc", luceneDocument);
 144             Element
  root = luceneDocument.getDocumentElement(); 145             Element
  [] fieldElements = helper.getChildren(root, "field"); 146
 147             Document document = super.getDocument(file, htdocsDumpDir);
 148
 149             Class
  [] parameterTypes = { String  .class, String  .class }; 150
 151             for (int i = 0; i < fieldElements.length; i++) {
 152                 String
  name = fieldElements[i].getAttribute("name"); 153                 String
  type = fieldElements[i].getAttribute("type"); 154                 String
  text = getText(fieldElements[i]); 155
 156                 Method
  method = Field.class.getMethod(type, parameterTypes); 157
 158                 String
  [] args = { name, text }; 159
 160                 Field field = (Field) method.invoke(null, args);
 161                 document.add(field);
 162
 163             }
 164
 165             return document;
 166         } catch (Exception
  e) { 167             throw e;
 168         }
 169     }
 170
 171
 174     protected void dumpLuceneDocument(File
  file, Writer  writer) throws IOException  { 175         log.debug(".dumpLuceneDocument(): Dump document: " + file.getAbsolutePath());
 176
 177         File
  luceneDocumentFile = new File  (file.getAbsolutePath() + ".xluc"); 178         luceneDocumentFile.createNewFile();
 179
 180         FileWriter
  fileWriter = new FileWriter  (luceneDocumentFile); 181         fileWriter.write(writer.toString());
 182         fileWriter.close();
 183     }
 184
 185
 192     public static String
  getText(Node  node) { 193         StringBuffer
  result = new StringBuffer  (); 194
 195         if (!node.hasChildNodes()) {
 196             return "";
 197         }
 198
 199         NodeList
  list = node.getChildNodes(); 200
 201         for (int i = 0; i < list.getLength(); i++) {
 202             Node
  subnode = list.item(i); 203
 204             if (subnode.getNodeType() == Node.TEXT_NODE) {
 205                 result.append(subnode.getNodeValue());
 206             } else if (subnode.getNodeType() == Node.CDATA_SECTION_NODE) {
 207                 result.append(subnode.getNodeValue());
 208             } else if (subnode.getNodeType() == Node.ENTITY_REFERENCE_NODE) {
 209                                                 result.append(getText(subnode));
 212             }
 213         }
 214
 215         return result.toString();
 216     }
 217
 218
 227     public static String
  getBodyText(File  file) throws Exception  { 228         HTMLParser parser = HTMLParserFactory.newInstance(file);
 229         parser.parse(file);
 230
 231         Reader
  reader = parser.getReader(); 232         Writer
  writer = new StringWriter  (); 233
 234         int c;
 235
 236         while ((c = reader.read()) != -1)
 237             writer.write(c);
 238
 239         String
  content = writer.toString(); 240         reader.close();
 241         writer.close();
 242
 243         content = StringCleaner.clean(content);
 244
 245         return content;
 246     }
 247 }
 248
                                                                                                                                                                                                             |                                                                       
 
 
 
 
 
                                                                                   Popular Tags                                                                                                                                                                                              |