KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > apache > cocoon > components > search > SimpleLuceneXMLIndexerImpl


1 /*
2  * Copyright 1999-2004 The Apache Software Foundation.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */

16 package org.apache.cocoon.components.search;
17
18 import org.apache.avalon.framework.configuration.Configurable;
19 import org.apache.avalon.framework.configuration.Configuration;
20 import org.apache.avalon.framework.configuration.ConfigurationException;
21 import org.apache.avalon.framework.logger.AbstractLogEnabled;
22 import org.apache.avalon.framework.service.ServiceException;
23 import org.apache.avalon.framework.service.ServiceManager;
24 import org.apache.avalon.framework.service.Serviceable;
25 import org.apache.avalon.framework.thread.ThreadSafe;
26 import org.apache.cocoon.ProcessingException;
27 import org.apache.commons.lang.StringUtils;
28 import org.apache.excalibur.xml.sax.SAXParser;
29 import org.apache.lucene.document.DateField;
30 import org.apache.lucene.document.Document;
31 import org.apache.lucene.document.Field;
32 import org.xml.sax.InputSource JavaDoc;
33 import org.xml.sax.SAXException JavaDoc;
34
35 import java.io.IOException JavaDoc;
36 import java.io.InputStream JavaDoc;
37 import java.net.URL JavaDoc;
38 import java.net.URLConnection JavaDoc;
39 import java.util.Collections JavaDoc;
40 import java.util.HashSet JavaDoc;
41 import java.util.Iterator JavaDoc;
42 import java.util.List JavaDoc;
43
44
45 /**
46  * A simple class building lucene documents from xml content.
47  *
48  * <p>It has two parameters that effect the way it works:</p>
49  * <p>
50  * <tt>&lt;store-fields/&gt;</tt>
51  * Sets which tags in your content are stored in Lucene as fields,
52  * during the indexing process. Allows them to be output with search hits.
53  * </p><p>
54  * <tt>&lt;content-view-query/&gt;</tt>
55  * Sets the view the indexer will request for indexing content.
56  * </p><p>
57  * Example configuration (goes in cocoon.xconf)
58  * <pre><tt>
59  * &lt;lucene-xml-indexer logger="core.search.lucene"&gt;
60  * &lt;store-fields&gt;title, summary&lt;/store-fields&gt;
61  * &lt;content-view-query&gt;cocoon-view=search&lt;/content-view-query&gt;
62  * &lt;/lucene-xml-indexer&gt;
63  * </tt></pre></p>
64  *
65  * @author <a HREF="mailto:berni_huber@a1.net">Bernhard Huber</a>
66  * @author <a HREF="mailto:jeremy@apache.org">Jeremy Quinn</a>
67  * @version CVS $Id: SimpleLuceneXMLIndexerImpl.java 123810 2004-12-31 17:07:31Z antonio $
68  */

69 public class SimpleLuceneXMLIndexerImpl extends AbstractLogEnabled
70          implements LuceneXMLIndexer, Configurable, Serviceable, ThreadSafe {
71
72     /**
73      * The service manager instance
74      *
75      * @since
76      */

77     protected ServiceManager manager = null;
78
79     /**
80      * Config element name specifying query-string appendend for requesting links
81      * of an URL.
82      * <p>
83      * Its value is <code>link-view-query</code>.
84      * </p>
85      *
86      * @since
87      */

88     public final static String JavaDoc CONTENT_VIEW_QUERY_CONFIG = "content-view-query";
89
90     /**
91      * append this string to the url in order to get the
92      * content view of the url
93      *
94      * @since
95      */

96     
97     final static String JavaDoc CONTENT_VIEW_QUERY_DEFAULT = "cocoon-view=content";
98
99     /**
100      * Config element name specifying the tags to be added as Stored, Untokenised, Unindexed Fields.
101      * <p>
102      * Its value is <code>field-tags</code>.
103      * </p>
104      *
105      * @since
106      */

107     public final static String JavaDoc FIELDTAGS_CONFIG = "store-fields";
108
109     /**
110      * set of allowed content types
111      *
112      * @since
113      */

114     final HashSet JavaDoc allowedContentType;
115
116
117     /**
118      * @since
119      */

120     public SimpleLuceneXMLIndexerImpl() {
121         allowedContentType = new HashSet JavaDoc();
122         allowedContentType.add("text/xml");
123         allowedContentType.add("text/xhtml");
124         fieldTags = new HashSet JavaDoc();
125     }
126     
127     
128     private String JavaDoc contentViewQuery = CONTENT_VIEW_QUERY_DEFAULT;
129     private HashSet JavaDoc fieldTags;
130
131
132     /**
133      * configure
134      *
135      * @param configuration
136      * @exception ConfigurationException
137      * @since
138      */

139     public void configure(Configuration configuration) throws ConfigurationException {
140     
141         Configuration[] children;
142         children = configuration.getChildren(FIELDTAGS_CONFIG);
143         if (children != null && children.length > 0) {
144             fieldTags = new HashSet JavaDoc();
145             for (int i = 0; i < children.length; i++) {
146                 String JavaDoc pattern = children[i].getValue();
147                 String JavaDoc params[] = StringUtils.split(pattern, ", ");
148                 for (int index = 0; index < params.length; index++) {
149                     String JavaDoc tokenized_pattern = params[index];
150                     if (!tokenized_pattern.equals("")) {
151                         this.fieldTags.add(tokenized_pattern);
152                         if (getLogger().isDebugEnabled()) {
153                                 getLogger().debug("add field: " + tokenized_pattern);
154                         }
155                     }
156                 }
157             }
158         } else {
159             if (getLogger().isDebugEnabled()) {
160                 getLogger().debug("Do not add any fields");
161             }
162         }
163         this.contentViewQuery = configuration.getChild(CONTENT_VIEW_QUERY_CONFIG, true).getValue(CONTENT_VIEW_QUERY_DEFAULT);
164                 if (getLogger().isDebugEnabled()) {
165                         getLogger().debug("content view: " + this.contentViewQuery);
166                 }
167     }
168
169
170     /**
171      * Set the current <code>ServiceManager</code> instance used by this
172      * <code>Serviceable</code>.
173      *
174      * @param manager Description of Parameter
175      * @exception ServiceException Description of Exception
176      * @since
177      */

178     public void service(ServiceManager manager) throws ServiceException {
179         this.manager = manager;
180     }
181
182
183     /**
184      * Build lucenen documents from a URL
185      *
186      * @param url the content of this url gets indexed.
187      * @exception ProcessingException Description of Exception
188      * @since
189      */

190     public List JavaDoc build(URL JavaDoc url)
191              throws ProcessingException {
192
193         try {
194             URL JavaDoc contentURL = new URL JavaDoc(url, url.getFile()
195                 + ((url.getFile().indexOf("?") == -1) ? "?" : "&")
196                 + contentViewQuery);
197             URLConnection JavaDoc contentURLConnection = contentURL.openConnection();
198             if (contentURLConnection == null) {
199                 throw new ProcessingException("Can not open connection to URL "
200                         + contentURL + " (null connection)");
201             }
202
203             String JavaDoc contentType = contentURLConnection.getContentType();
204             if (contentType == null) {
205                 if (getLogger().isDebugEnabled()) {
206                     getLogger().debug("Ignoring " + contentURL + " (no content type)");
207                 }
208
209                 return Collections.EMPTY_LIST;
210             }
211
212             int index = contentType.indexOf(';');
213             if (index != -1) {
214                 contentType = contentType.substring(0, index);
215             }
216
217             if (allowedContentType.contains(contentType)) {
218                 if (getLogger().isDebugEnabled()) {
219                     getLogger().debug("Indexing " + contentURL + " (" + contentType + ")");
220                 }
221
222                 LuceneIndexContentHandler luceneIndexContentHandler = new LuceneIndexContentHandler();
223                 luceneIndexContentHandler.setFieldTags(fieldTags);
224                 indexDocument(contentURLConnection, luceneIndexContentHandler);
225                 //
226
// document is parsed
227
//
228
Iterator JavaDoc it = luceneIndexContentHandler.iterator();
229                 while (it.hasNext()) {
230                     Document d = (Document) it.next();
231                     d.add(Field.UnIndexed(URL_FIELD, url.toString()));
232                     // store ... false, index ... true, token ... false
233
d.add(new Field(UID_FIELD, uid(contentURLConnection), false, true, false));
234                 }
235
236                 return luceneIndexContentHandler.allDocuments();
237             } else {
238                 if (getLogger().isDebugEnabled()) {
239                     getLogger().debug("Ignoring " + contentURL + " (" + contentType + ")");
240                 }
241
242                 return Collections.EMPTY_LIST;
243             }
244         } catch (IOException JavaDoc ioe) {
245             throw new ProcessingException("Cannot read URL " + url, ioe);
246         }
247     }
248
249
250     /**
251      * index input stream producing lucene Documents
252      *
253      * @param contentURLConnection the xml content which should get indexed.
254      * @param luceneIndexContentHandler ContentHandler for generating
255      * a lucene Document from XML content.
256      * @exception ProcessingException Description of Exception
257      * @since
258      */

259     private void indexDocument(URLConnection JavaDoc contentURLConnection,
260             LuceneIndexContentHandler luceneIndexContentHandler)
261              throws ProcessingException {
262
263         InputStream JavaDoc is = null;
264         InputSource JavaDoc in = null;
265         SAXParser parser = null;
266
267         try {
268             is = contentURLConnection.getInputStream();
269             in = new InputSource JavaDoc(is);
270
271             // get an XML parser
272
parser = (SAXParser) this.manager.lookup(SAXParser.ROLE);
273             //reader.setErrorHandler(new CocoonErrorHandler());
274
parser.parse(in, luceneIndexContentHandler);
275             //
276
// document is parsed
277
//
278
} catch (IOException JavaDoc ioe) {
279             throw new ProcessingException("Cannot read!", ioe);
280         } catch (SAXException JavaDoc saxe) {
281             throw new ProcessingException("Cannot parse!", saxe);
282         } catch (ServiceException se) {
283             throw new ProcessingException("Cannot lookup xml parser!", se);
284         } finally {
285             if (parser != null) {
286                 this.manager.release(parser);
287             }
288         }
289     }
290
291
292     /**
293      * return a unique uid of a url connection
294      *
295      * @param urlConnection Description of Parameter
296      * @return String unique uid of a urlConnection
297      * @since
298      */

299     private String JavaDoc uid(URLConnection JavaDoc urlConnection) {
300         // Append path and date into a string in such a way that lexicographic
301
// sorting gives the same results as a walk of the file hierarchy. Thus
302
// null (\u0000) is used both to separate directory components and to
303
// separate the path from the date.
304
return urlConnection.toString().replace('/', '\u0000') +
305                 "\u0000" +
306                 DateField.timeToString(urlConnection.getLastModified());
307     }
308 }
309
310
Popular Tags