ConfigurableDocumentCreator


1   /*
2    * Copyright  1999-2004 The Apache Software Foundation
3    *
4    *  Licensed under the Apache License, Version 2.0 (the "License");
5    *  you may not use this file except in compliance with the License.
6    *  You may obtain a copy of the License at
7    *
8    *      http://www.apache.org/licenses/LICENSE-2.0
9    *
10   *  Unless required by applicable law or agreed to in writing, software
11   *  distributed under the License is distributed on an "AS IS" BASIS,
12   *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   *  See the License for the specific language governing permissions and
14   *  limitations under the License.
15   *
16   */
17  
18  /* $Id: ConfigurableDocumentCreator.java 43020 2004-05-16 23:23:13Z michi $  */
19  
20  package org.apache.lenya.lucene.index;
21  
22  import java.io.File  ;
23  import java.io.FileWriter  ;
24  import java.io.IOException  ;
25  import java.io.Reader  ;
26  import java.io.StringReader  ;
27  import java.io.StringWriter  ;
28  import java.io.Writer  ;
29  import java.lang.reflect.Method  ;
30  
31  import javax.xml.parsers.DocumentBuilder  ;
32  import javax.xml.parsers.DocumentBuilderFactory  ;
33  import javax.xml.transform.OutputKeys  ;
34  import javax.xml.transform.Transformer  ;
35  import javax.xml.transform.TransformerFactory  ;
36  import javax.xml.transform.dom.DOMSource  ;
37  import javax.xml.transform.stream.StreamResult  ;
38  import javax.xml.transform.stream.StreamSource  ;
39  
40  import org.apache.lenya.lucene.parser.HTMLParser;
41  import org.apache.lenya.lucene.parser.HTMLParserFactory;
42  import org.apache.lenya.lucene.parser.StringCleaner;
43  import org.apache.lenya.xml.DocumentHelper;
44  import org.apache.lenya.xml.NamespaceHelper;
45  import org.apache.log4j.Category;
46  import org.apache.lucene.document.Document;
47  import org.apache.lucene.document.Field;
48  import org.w3c.dom.Element  ;
49  import org.w3c.dom.Node  ;
50  import org.w3c.dom.NodeList  ;
51  import org.xml.sax.InputSource  ;
52  
53  /**
54   * Uses XSLT to transform a XML into a Lucene document
55   */
56  public class ConfigurableDocumentCreator extends AbstractDocumentCreator {
57      Category log = Category.getInstance(ConfigurableDocumentCreator.class);
58    
59      public static final String   LUCENE_NAMESPACE = "http://apache.org/cocoon/lenya/lucene/1.0";
60      public static final String   XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml";
61  
62      /**
63       * Creates a new ConfigurableDocumentCreator object.
64       *
65       * @param stylesheet DOCUMENT ME!
66       */
67      public ConfigurableDocumentCreator(String   stylesheet) {
68          this.stylesheet = stylesheet;
69      }
70  
71      private String   stylesheet;
72  
73      /**
74       * DOCUMENT ME!
75       *
76       * @return DOCUMENT ME!
77       */
78      public String   getStylesheet() {
79          return stylesheet;
80      }
81  
82      /**
83       * Transform source document into lucene document and generate a Lucene Document instance
84       *
85       * @param file DOCUMENT ME!
86       * @param htdocsDumpDir DOCUMENT ME!
87       *
88       * @return DOCUMENT ME!
89       *
90       * @throws Exception DOCUMENT ME!
91       */
92      public Document getDocument(File   file, File   htdocsDumpDir) throws Exception   {
93          log.debug(".getDocument() : indexing " + file.getAbsolutePath());
94          try {
95  
96              org.w3c.dom.Document   sourceDocument = null;
97              DocumentBuilderFactory   parserFactory = DocumentBuilderFactory.newInstance();
98              parserFactory.setValidating(false);
99              parserFactory.setNamespaceAware(true);
100             parserFactory.setIgnoringElementContentWhitespace(true);
101             DocumentBuilder   mybuilder = parserFactory.newDocumentBuilder();
102             sourceDocument = mybuilder.parse(file.getAbsolutePath());
103 
104 
105 // FIXME: What is this good for: <?xml version="1.0"?><body>...</body>
106 /*
107             NamespaceHelper documentHelper = new NamespaceHelper(XHTML_NAMESPACE, "xhtml", "html");
108             org.w3c.dom.Document sourceDocument = documentHelper.getDocument();
109 
110             Element rootNode = sourceDocument.getDocumentElement();
111 
112             String bodyText = getBodyText(file);
113             Element bodyElement = documentHelper.createElement("body", bodyText);
114             rootNode.appendChild(bodyElement);
115 */
116 
117 
118 
119 
120             DOMSource   documentSource = new DOMSource  (sourceDocument);
121             Writer   documentWriter = new StringWriter  ();
122 
123             TransformerFactory   tFactory = TransformerFactory.newInstance();
124             Transformer   documentTransformer = tFactory.newTransformer(new StreamSource  (new StringReader  (getStylesheet())));
125             documentTransformer.setOutputProperty(OutputKeys.INDENT, "yes");
126             documentTransformer.setOutputProperty(OutputKeys.ENCODING, "ISO-8859-1");
127 
128             String   fileName = file.getName();
129 
130             if (fileName.endsWith(".pdf.txt")) {
131                 fileName = fileName.substring(0, fileName.lastIndexOf(".txt"));
132             }
133 
134             documentTransformer.setParameter("filename", fileName);
135             documentTransformer.transform(documentSource, new StreamResult  (documentWriter));
136 
137             // DEBUG: debug lucene documents
138             //dumpLuceneDocument(file, documentWriter);
139 
140             DocumentBuilder   builder = DocumentHelper.createBuilder();
141             org.w3c.dom.Document   luceneDocument = builder.parse(new InputSource  (new StringReader  (documentWriter.toString())));
142 
143             NamespaceHelper helper = new NamespaceHelper(LUCENE_NAMESPACE, "luc", luceneDocument);
144             Element   root = luceneDocument.getDocumentElement();
145             Element  [] fieldElements = helper.getChildren(root, "field");
146 
147             Document document = super.getDocument(file, htdocsDumpDir);
148 
149             Class  [] parameterTypes = { String  .class, String  .class };
150 
151             for (int i = 0; i < fieldElements.length; i++) {
152                 String   name = fieldElements[i].getAttribute("name");
153                 String   type = fieldElements[i].getAttribute("type");
154                 String   text = getText(fieldElements[i]);
155 
156                 Method   method = Field.class.getMethod(type, parameterTypes);
157 
158                 String  [] args = { name, text };
159 
160                 Field field = (Field) method.invoke(null, args);
161                 document.add(field);
162 
163             }
164 
165             return document;
166         } catch (Exception   e) {
167             throw e;
168         }
169     }
170 
171     /**
172      * Writes the lucene XML document to a file.
173      */
174     protected void dumpLuceneDocument(File   file, Writer   writer) throws IOException   {
175         log.debug(".dumpLuceneDocument(): Dump document: " + file.getAbsolutePath());
176 
177         File   luceneDocumentFile = new File  (file.getAbsolutePath() + ".xluc");
178         luceneDocumentFile.createNewFile();
179 
180         FileWriter   fileWriter = new FileWriter  (luceneDocumentFile);
181         fileWriter.write(writer.toString());
182         fileWriter.close();
183     }
184 
185     /**
186      * DOCUMENT ME!
187      *
188      * @param node DOCUMENT ME!
189      *
190      * @return DOCUMENT ME!
191      */
192     public static String   getText(Node   node) {
193         StringBuffer   result = new StringBuffer  ();
194 
195         if (!node.hasChildNodes()) {
196             return "";
197         }
198 
199         NodeList   list = node.getChildNodes();
200 
201         for (int i = 0; i < list.getLength(); i++) {
202             Node   subnode = list.item(i);
203 
204             if (subnode.getNodeType() == Node.TEXT_NODE) {
205                 result.append(subnode.getNodeValue());
206             } else if (subnode.getNodeType() == Node.CDATA_SECTION_NODE) {
207                 result.append(subnode.getNodeValue());
208             } else if (subnode.getNodeType() == Node.ENTITY_REFERENCE_NODE) {
209                 // Recurse into the subtree for text
210                 // (and ignore comments)
211                 result.append(getText(subnode));
212             }
213         }
214 
215         return result.toString();
216     }
217 
218     /**
219      * DOCUMENT ME!
220      *
221      * @param file DOCUMENT ME!
222      *
223      * @return DOCUMENT ME!
224      *
225      * @throws Exception DOCUMENT ME!
226      */
227     public static String   getBodyText(File   file) throws Exception   {
228         HTMLParser parser = HTMLParserFactory.newInstance(file);
229         parser.parse(file);
230 
231         Reader   reader = parser.getReader();
232         Writer   writer = new StringWriter  ();
233 
234         int c;
235 
236         while ((c = reader.read()) != -1)
237             writer.write(c);
238 
239         String   content = writer.toString();
240         reader.close();
241         writer.close();
242 
243         content = StringCleaner.clean(content);
244 
245         return content;
246     }
247 }
248
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags