KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > apache > lenya > lucene > index > ConfigurableDocumentCreator


1 /*
2  * Copyright 1999-2004 The Apache Software Foundation
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  *
16  */

17
18 /* $Id: ConfigurableDocumentCreator.java 43020 2004-05-16 23:23:13Z michi $ */
19
20 package org.apache.lenya.lucene.index;
21
22 import java.io.File JavaDoc;
23 import java.io.FileWriter JavaDoc;
24 import java.io.IOException JavaDoc;
25 import java.io.Reader JavaDoc;
26 import java.io.StringReader JavaDoc;
27 import java.io.StringWriter JavaDoc;
28 import java.io.Writer JavaDoc;
29 import java.lang.reflect.Method JavaDoc;
30
31 import javax.xml.parsers.DocumentBuilder JavaDoc;
32 import javax.xml.parsers.DocumentBuilderFactory JavaDoc;
33 import javax.xml.transform.OutputKeys JavaDoc;
34 import javax.xml.transform.Transformer JavaDoc;
35 import javax.xml.transform.TransformerFactory JavaDoc;
36 import javax.xml.transform.dom.DOMSource JavaDoc;
37 import javax.xml.transform.stream.StreamResult JavaDoc;
38 import javax.xml.transform.stream.StreamSource JavaDoc;
39
40 import org.apache.lenya.lucene.parser.HTMLParser;
41 import org.apache.lenya.lucene.parser.HTMLParserFactory;
42 import org.apache.lenya.lucene.parser.StringCleaner;
43 import org.apache.lenya.xml.DocumentHelper;
44 import org.apache.lenya.xml.NamespaceHelper;
45 import org.apache.log4j.Category;
46 import org.apache.lucene.document.Document;
47 import org.apache.lucene.document.Field;
48 import org.w3c.dom.Element JavaDoc;
49 import org.w3c.dom.Node JavaDoc;
50 import org.w3c.dom.NodeList JavaDoc;
51 import org.xml.sax.InputSource JavaDoc;
52
53 /**
54  * Uses XSLT to transform a XML into a Lucene document
55  */

56 public class ConfigurableDocumentCreator extends AbstractDocumentCreator {
57     Category log = Category.getInstance(ConfigurableDocumentCreator.class);
58   
59     public static final String JavaDoc LUCENE_NAMESPACE = "http://apache.org/cocoon/lenya/lucene/1.0";
60     public static final String JavaDoc XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml";
61
62     /**
63      * Creates a new ConfigurableDocumentCreator object.
64      *
65      * @param stylesheet DOCUMENT ME!
66      */

67     public ConfigurableDocumentCreator(String JavaDoc stylesheet) {
68         this.stylesheet = stylesheet;
69     }
70
71     private String JavaDoc stylesheet;
72
73     /**
74      * DOCUMENT ME!
75      *
76      * @return DOCUMENT ME!
77      */

78     public String JavaDoc getStylesheet() {
79         return stylesheet;
80     }
81
82     /**
83      * Transform source document into lucene document and generate a Lucene Document instance
84      *
85      * @param file DOCUMENT ME!
86      * @param htdocsDumpDir DOCUMENT ME!
87      *
88      * @return DOCUMENT ME!
89      *
90      * @throws Exception DOCUMENT ME!
91      */

92     public Document getDocument(File JavaDoc file, File JavaDoc htdocsDumpDir) throws Exception JavaDoc {
93         log.debug(".getDocument() : indexing " + file.getAbsolutePath());
94         try {
95
96             org.w3c.dom.Document JavaDoc sourceDocument = null;
97             DocumentBuilderFactory JavaDoc parserFactory = DocumentBuilderFactory.newInstance();
98             parserFactory.setValidating(false);
99             parserFactory.setNamespaceAware(true);
100             parserFactory.setIgnoringElementContentWhitespace(true);
101             DocumentBuilder JavaDoc mybuilder = parserFactory.newDocumentBuilder();
102             sourceDocument = mybuilder.parse(file.getAbsolutePath());
103
104
105 // FIXME: What is this good for: <?xml version="1.0"?><body>...</body>
106
/*
107             NamespaceHelper documentHelper = new NamespaceHelper(XHTML_NAMESPACE, "xhtml", "html");
108             org.w3c.dom.Document sourceDocument = documentHelper.getDocument();
109
110             Element rootNode = sourceDocument.getDocumentElement();
111
112             String bodyText = getBodyText(file);
113             Element bodyElement = documentHelper.createElement("body", bodyText);
114             rootNode.appendChild(bodyElement);
115 */

116
117
118
119
120             DOMSource JavaDoc documentSource = new DOMSource JavaDoc(sourceDocument);
121             Writer JavaDoc documentWriter = new StringWriter JavaDoc();
122
123             TransformerFactory JavaDoc tFactory = TransformerFactory.newInstance();
124             Transformer JavaDoc documentTransformer = tFactory.newTransformer(new StreamSource JavaDoc(new StringReader JavaDoc(getStylesheet())));
125             documentTransformer.setOutputProperty(OutputKeys.INDENT, "yes");
126             documentTransformer.setOutputProperty(OutputKeys.ENCODING, "ISO-8859-1");
127
128             String JavaDoc fileName = file.getName();
129
130             if (fileName.endsWith(".pdf.txt")) {
131                 fileName = fileName.substring(0, fileName.lastIndexOf(".txt"));
132             }
133
134             documentTransformer.setParameter("filename", fileName);
135             documentTransformer.transform(documentSource, new StreamResult JavaDoc(documentWriter));
136
137             // DEBUG: debug lucene documents
138
//dumpLuceneDocument(file, documentWriter);
139

140             DocumentBuilder JavaDoc builder = DocumentHelper.createBuilder();
141             org.w3c.dom.Document JavaDoc luceneDocument = builder.parse(new InputSource JavaDoc(new StringReader JavaDoc(documentWriter.toString())));
142
143             NamespaceHelper helper = new NamespaceHelper(LUCENE_NAMESPACE, "luc", luceneDocument);
144             Element JavaDoc root = luceneDocument.getDocumentElement();
145             Element JavaDoc[] fieldElements = helper.getChildren(root, "field");
146
147             Document document = super.getDocument(file, htdocsDumpDir);
148
149             Class JavaDoc[] parameterTypes = { String JavaDoc.class, String JavaDoc.class };
150
151             for (int i = 0; i < fieldElements.length; i++) {
152                 String JavaDoc name = fieldElements[i].getAttribute("name");
153                 String JavaDoc type = fieldElements[i].getAttribute("type");
154                 String JavaDoc text = getText(fieldElements[i]);
155
156                 Method JavaDoc method = Field.class.getMethod(type, parameterTypes);
157
158                 String JavaDoc[] args = { name, text };
159
160                 Field field = (Field) method.invoke(null, args);
161                 document.add(field);
162
163             }
164
165             return document;
166         } catch (Exception JavaDoc e) {
167             throw e;
168         }
169     }
170
171     /**
172      * Writes the lucene XML document to a file.
173      */

174     protected void dumpLuceneDocument(File JavaDoc file, Writer JavaDoc writer) throws IOException JavaDoc {
175         log.debug(".dumpLuceneDocument(): Dump document: " + file.getAbsolutePath());
176
177         File JavaDoc luceneDocumentFile = new File JavaDoc(file.getAbsolutePath() + ".xluc");
178         luceneDocumentFile.createNewFile();
179
180         FileWriter JavaDoc fileWriter = new FileWriter JavaDoc(luceneDocumentFile);
181         fileWriter.write(writer.toString());
182         fileWriter.close();
183     }
184
185     /**
186      * DOCUMENT ME!
187      *
188      * @param node DOCUMENT ME!
189      *
190      * @return DOCUMENT ME!
191      */

192     public static String JavaDoc getText(Node JavaDoc node) {
193         StringBuffer JavaDoc result = new StringBuffer JavaDoc();
194
195         if (!node.hasChildNodes()) {
196             return "";
197         }
198
199         NodeList JavaDoc list = node.getChildNodes();
200
201         for (int i = 0; i < list.getLength(); i++) {
202             Node JavaDoc subnode = list.item(i);
203
204             if (subnode.getNodeType() == Node.TEXT_NODE) {
205                 result.append(subnode.getNodeValue());
206             } else if (subnode.getNodeType() == Node.CDATA_SECTION_NODE) {
207                 result.append(subnode.getNodeValue());
208             } else if (subnode.getNodeType() == Node.ENTITY_REFERENCE_NODE) {
209                 // Recurse into the subtree for text
210
// (and ignore comments)
211
result.append(getText(subnode));
212             }
213         }
214
215         return result.toString();
216     }
217
218     /**
219      * DOCUMENT ME!
220      *
221      * @param file DOCUMENT ME!
222      *
223      * @return DOCUMENT ME!
224      *
225      * @throws Exception DOCUMENT ME!
226      */

227     public static String JavaDoc getBodyText(File JavaDoc file) throws Exception JavaDoc {
228         HTMLParser parser = HTMLParserFactory.newInstance(file);
229         parser.parse(file);
230
231         Reader JavaDoc reader = parser.getReader();
232         Writer JavaDoc writer = new StringWriter JavaDoc();
233
234         int c;
235
236         while ((c = reader.read()) != -1)
237             writer.write(c);
238
239         String JavaDoc content = writer.toString();
240         reader.close();
241         writer.close();
242
243         content = StringCleaner.clean(content);
244
245         return content;
246     }
247 }
248
Popular Tags