KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > opencms > search > documents > A_CmsVfsDocument


1 /*
2  * File : $Source: /usr/local/cvs/opencms/src/org/opencms/search/documents/A_CmsVfsDocument.java,v $
3  * Date : $Date: 2006/03/27 14:53:05 $
4  * Version: $Revision: 1.14 $
5  *
6  * This library is part of OpenCms -
7  * the Open Source Content Management System
8  *
9  * Copyright (c) 2005 Alkacon Software GmbH (http://www.alkacon.com)
10  *
11  * This library is free software; you can redistribute it and/or
12  * modify it under the terms of the GNU Lesser General Public
13  * License as published by the Free Software Foundation; either
14  * version 2.1 of the License, or (at your option) any later version.
15  *
16  * This library is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19  * Lesser General Public License for more details.
20  *
21  * For further information about Alkacon Software GmbH, please see the
22  * company website: http://www.alkacon.com
23  *
24  * For further information about OpenCms, please see the
25  * project website: http://www.opencms.org
26  *
27  * You should have received a copy of the GNU Lesser General Public
28  * License along with this library; if not, write to the Free Software
29  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
30  */

31
32 package org.opencms.search.documents;
33
34 import org.opencms.file.CmsFile;
35 import org.opencms.file.CmsObject;
36 import org.opencms.file.CmsPropertyDefinition;
37 import org.opencms.file.CmsResource;
38 import org.opencms.file.types.I_CmsResourceType;
39 import org.opencms.main.CmsException;
40 import org.opencms.main.CmsLog;
41 import org.opencms.main.OpenCms;
42 import org.opencms.search.A_CmsIndexResource;
43 import org.opencms.search.CmsIndexException;
44 import org.opencms.search.CmsSearchCategoryCollector;
45 import org.opencms.search.CmsSearchIndex;
46 import org.opencms.search.extractors.I_CmsExtractionResult;
47 import org.opencms.util.CmsStringUtil;
48
49 import java.util.ArrayList JavaDoc;
50 import java.util.Date JavaDoc;
51 import java.util.Iterator JavaDoc;
52 import java.util.List JavaDoc;
53 import java.util.Map JavaDoc;
54
55 import org.apache.commons.logging.Log;
56 import org.apache.lucene.document.DateTools;
57 import org.apache.lucene.document.Document;
58 import org.apache.lucene.document.Field;
59
60 /**
61  * Base document factory class for a VFS <code>{@link org.opencms.file.CmsResource}</code>,
62  * just requires a specialized implementation of
63  * <code>{@link I_CmsDocumentFactory#extractContent(CmsObject, A_CmsIndexResource, String)}</code>
64  * for text extraction from the binary document content.<p>
65  *
66  * @author Carsten Weinholz
67  * @author Alexander Kandzior
68  *
69  * @version $Revision: 1.14 $
70  *
71  * @since 6.0.0
72  */

73 public abstract class A_CmsVfsDocument implements I_CmsDocumentFactory {
74
75     /** The vfs prefix for document keys. */
76     public static final String JavaDoc VFS_DOCUMENT_KEY_PREFIX = "VFS";
77
78     /** The log object for this class. */
79     private static final Log LOG = CmsLog.getLog(A_CmsVfsDocument.class);
80
81     /**
82      * Name of the documenttype.
83      */

84     protected String JavaDoc m_name;
85
86     /**
87      * Creates a new instance of this lucene document factory.<p>
88      *
89      * @param name name of the documenttype
90      */

91     public A_CmsVfsDocument(String JavaDoc name) {
92
93         m_name = name;
94     }
95
96     /**
97      * @see org.opencms.search.documents.I_CmsDocumentFactory#getDocumentKey(java.lang.String)
98      */

99     public String JavaDoc getDocumentKey(String JavaDoc resourceType) throws CmsIndexException {
100
101         try {
102             return VFS_DOCUMENT_KEY_PREFIX + ((I_CmsResourceType)Class.forName(resourceType).newInstance()).getTypeId();
103         } catch (Exception JavaDoc exc) {
104             throw new CmsIndexException(Messages.get().container(
105                 Messages.ERR_RESOURCE_TYPE_INSTANTIATION_1,
106                 resourceType), exc);
107         }
108     }
109
110     /**
111      * @see org.opencms.search.documents.I_CmsDocumentFactory#getDocumentKeys(java.util.List, java.util.List)
112      */

113     public List JavaDoc getDocumentKeys(List JavaDoc resourceTypes, List JavaDoc mimeTypes) throws CmsException {
114
115         ArrayList JavaDoc keys = new ArrayList JavaDoc();
116
117         if (resourceTypes.contains("*")) {
118             ArrayList JavaDoc allTypes = new ArrayList JavaDoc();
119             for (Iterator JavaDoc i = OpenCms.getResourceManager().getResourceTypes().iterator(); i.hasNext();) {
120                 I_CmsResourceType resourceType = (I_CmsResourceType)i.next();
121                 allTypes.add(resourceType.getTypeName());
122             }
123             resourceTypes = allTypes;
124         }
125
126         try {
127             for (Iterator JavaDoc i = resourceTypes.iterator(); i.hasNext();) {
128
129                 int id = OpenCms.getResourceManager().getResourceType((String JavaDoc)i.next()).getTypeId();
130                 for (Iterator JavaDoc j = mimeTypes.iterator(); j.hasNext();) {
131                     keys.add(VFS_DOCUMENT_KEY_PREFIX + id + ":" + (String JavaDoc)j.next());
132                 }
133                 if (mimeTypes.isEmpty()) {
134                     keys.add(VFS_DOCUMENT_KEY_PREFIX + id);
135                 }
136             }
137         } catch (Exception JavaDoc exc) {
138             throw new CmsException(Messages.get().container(Messages.ERR_CREATE_DOC_KEY_0), exc);
139         }
140
141         return keys;
142     }
143
144     /**
145      * @see org.opencms.search.documents.I_CmsDocumentFactory#getName()
146      */

147     public String JavaDoc getName() {
148
149         return m_name;
150     }
151
152     /**
153      * Generates a new lucene document instance from contents of the given resource.<p>
154      *
155      * @see org.opencms.search.documents.I_CmsDocumentFactory#newInstance(org.opencms.file.CmsObject, org.opencms.search.A_CmsIndexResource, java.lang.String)
156      */

157     public Document newInstance(CmsObject cms, A_CmsIndexResource resource, String JavaDoc language) throws CmsException {
158
159         Document document = new Document();
160         CmsResource res = (CmsResource)resource.getData();
161         String JavaDoc path = cms.getRequestContext().removeSiteRoot(resource.getRootPath());
162
163         // extract the content from the resource
164
String JavaDoc text = null;
165         try {
166             I_CmsExtractionResult content = extractContent(cms, resource, language);
167             text = mergeMetaInfo(content);
168             content.release();
169         } catch (Exception JavaDoc e) {
170             // text extraction failed for document - continue indexing meta information only
171
LOG.error(Messages.get().getBundle().key(Messages.ERR_TEXT_EXTRACTION_1, resource.getRootPath()), e);
172         }
173         if (text != null) {
174             document.add(new Field(I_CmsDocumentFactory.DOC_CONTENT, text, Field.Store.YES, Field.Index.TOKENIZED));
175         }
176
177         StringBuffer JavaDoc meta = new StringBuffer JavaDoc(512);
178         String JavaDoc value;
179         Field field;
180
181         // add the title from the property
182
value = cms.readPropertyObject(path, CmsPropertyDefinition.PROPERTY_TITLE, false).getValue();
183         if (CmsStringUtil.isNotEmpty(value)) {
184             value = value.trim();
185             if (value.length() > 0) {
186                 // add title as keyword, required for sorting
187
field = new Field(I_CmsDocumentFactory.DOC_TITLE_KEY, value, Field.Store.YES, Field.Index.UN_TOKENIZED);
188                 // title keyword field should not affect the boost factor
189
field.setBoost(0);
190                 document.add(field);
191                 // add title again as indexed field for searching
192
document.add(new Field(
193                     I_CmsDocumentFactory.DOC_TITLE_INDEXED,
194                     value,
195                     Field.Store.NO,
196                     Field.Index.TOKENIZED));
197                 meta.append(value);
198                 meta.append(" ");
199             }
200         }
201         // add the keywords from the property
202
value = cms.readPropertyObject(path, CmsPropertyDefinition.PROPERTY_KEYWORDS, false).getValue();
203         if (CmsStringUtil.isNotEmptyOrWhitespaceOnly(value)) {
204             document.add(new Field(I_CmsDocumentFactory.DOC_KEYWORDS, value, Field.Store.YES, Field.Index.TOKENIZED));
205             meta.append(value);
206             meta.append(" ");
207         }
208         // add the description from the property
209
value = cms.readPropertyObject(path, CmsPropertyDefinition.PROPERTY_DESCRIPTION, false).getValue();
210         if (CmsStringUtil.isNotEmptyOrWhitespaceOnly(value)) {
211             document.add(new Field(I_CmsDocumentFactory.DOC_DESCRIPTION, value, Field.Store.YES, Field.Index.TOKENIZED));
212             meta.append(value);
213             meta.append(" ");
214         }
215         // add the collected meta information
216
String JavaDoc metaInf = meta.toString();
217         if (CmsStringUtil.isNotEmptyOrWhitespaceOnly(metaInf)) {
218             document.add(new Field(I_CmsDocumentFactory.DOC_META, metaInf, Field.Store.NO, Field.Index.TOKENIZED));
219         }
220
221         // add the category of the file (this is searched so the value can also be attached on a folder)
222
value = cms.readPropertyObject(path, CmsPropertyDefinition.PROPERTY_SEARCH_CATEGORY, true).getValue();
223         if (CmsStringUtil.isNotEmpty(value)) {
224             // all categorys are internally stored lower case
225
value = value.trim().toLowerCase();
226             if (value.length() > 0) {
227                 field = new Field(I_CmsDocumentFactory.DOC_CATEGORY, value, Field.Store.YES, Field.Index.UN_TOKENIZED);
228                 field.setBoost(0);
229                 document.add(field);
230             }
231         } else {
232             // synthetic "unknown" category if no category property defined for resource
233
field = new Field(
234                 I_CmsDocumentFactory.DOC_CATEGORY,
235                 CmsSearchCategoryCollector.UNKNOWN_CATEGORY,
236                 Field.Store.YES,
237                 Field.Index.UN_TOKENIZED);
238             document.add(field);
239         }
240
241         // add the document root path, optimized for use with a phrase query
242
String JavaDoc rootPath = CmsSearchIndex.rootPathRewrite(resource.getRootPath());
243         field = new Field(I_CmsDocumentFactory.DOC_ROOT, rootPath, Field.Store.YES, Field.Index.TOKENIZED);
244         // set boost of 0 to root path field, since root path should have no effect on search result score
245
field.setBoost(0);
246         document.add(field);
247         // root path is stored again in "plain" format, but not for indexing since I_CmsDocumentFactory.DOC_ROOT is used for that
248
// must be indexed as a keyword ONLY to be able to use this when deleting a resource from the index
249
document.add(new Field(
250             I_CmsDocumentFactory.DOC_PATH,
251             resource.getRootPath(),
252             Field.Store.YES,
253             Field.Index.UN_TOKENIZED));
254
255         // add date of creation and last modification as keywords (for sorting)
256
field = new Field(I_CmsDocumentFactory.DOC_DATE_CREATED, DateTools.dateToString(
257             new Date JavaDoc(res.getDateCreated()),
258             DateTools.Resolution.MILLISECOND), Field.Store.YES, Field.Index.UN_TOKENIZED);
259         field.setBoost(0);
260         document.add(field);
261         field = new Field(I_CmsDocumentFactory.DOC_DATE_LASTMODIFIED, DateTools.dateToString(new Date JavaDoc(
262             res.getDateLastModified()), DateTools.Resolution.MILLISECOND), Field.Store.YES, Field.Index.UN_TOKENIZED);
263         field.setBoost(0);
264         document.add(field);
265
266         // special field for VFS documents - add a marker so that the document can be identified as VFS resource
267
document.add(new Field(I_CmsDocumentFactory.DOC_TYPE, VFS_DOCUMENT_KEY_PREFIX, Field.Store.YES, Field.Index.NO));
268
269         float boost = 1.0f;
270         // note that the priority property IS searched, so you can easily flag whole folders as "high" or "low"
271
value = cms.readPropertyObject(path, CmsPropertyDefinition.PROPERTY_SEARCH_PRIORITY, true).getValue();
272         if (value != null) {
273             value = value.trim().toLowerCase();
274             if (value.equals(I_CmsDocumentFactory.SEARCH_PRIORITY_MAX_VALUE)) {
275                 boost = 2.0f;
276             } else if (value.equals(I_CmsDocumentFactory.SEARCH_PRIORITY_HIGH_VALUE)) {
277                 boost = 1.5f;
278             } else if (value.equals(I_CmsDocumentFactory.SEARCH_PRIORITY_LOW_VALUE)) {
279                 boost = 0.5f;
280             }
281         }
282         // set document boost factor
283
document.setBoost(boost);
284
285         return document;
286     }
287
288     /**
289      * Returns a String created out of the content and the most important meta information in the given
290      * extraction result.<p>
291      *
292      * OpenCms uses it's own properties for the text "Title" etc. field, this method ensures
293      * the most important document meta information can still be found as part of the content.<p>
294      *
295      * @param extractedContent the extraction result to merge
296      *
297      * @return a String created out of the most important meta information in the given map and the content
298      */

299     protected String JavaDoc mergeMetaInfo(I_CmsExtractionResult extractedContent) {
300
301         Map JavaDoc metaInfo = extractedContent.getMetaInfo();
302         String JavaDoc content = extractedContent.getContent();
303
304         if (((metaInfo == null) || (metaInfo.size() == 0)) && (CmsStringUtil.isEmpty(content))) {
305             return null;
306         }
307
308         StringBuffer JavaDoc result = new StringBuffer JavaDoc(4096);
309         if (metaInfo != null) {
310             String JavaDoc meta;
311             meta = (String JavaDoc)metaInfo.get(I_CmsExtractionResult.META_TITLE);
312             if (CmsStringUtil.isNotEmpty(meta)) {
313                 result.append(meta);
314                 result.append('\n');
315             }
316             meta = (String JavaDoc)metaInfo.get(I_CmsExtractionResult.META_SUBJECT);
317             if (CmsStringUtil.isNotEmpty(meta)) {
318                 result.append(meta);
319                 result.append('\n');
320             }
321             meta = (String JavaDoc)metaInfo.get(I_CmsExtractionResult.META_KEYWORDS);
322             if (CmsStringUtil.isNotEmpty(meta)) {
323                 result.append(meta);
324                 result.append('\n');
325             }
326             meta = (String JavaDoc)metaInfo.get(I_CmsExtractionResult.META_COMMENTS);
327             if (CmsStringUtil.isNotEmpty(meta)) {
328                 result.append(meta);
329                 result.append('\n');
330             }
331         }
332
333         if (content != null) {
334             result.append(content);
335         }
336
337         return result.toString();
338     }
339
340     /**
341      * Upgrades the given resource to a {@link CmsFile} with content.<p>
342      *
343      * @param cms the current users OpenCms context
344      * @param resource the resource to upgrade
345      *
346      * @return the given resource upgraded to a {@link CmsFile} with content
347      *
348      * @throws CmsException if the resource could not be read
349      * @throws CmsIndexException if the resource has no content
350      */

351     protected CmsFile readFile(CmsObject cms, CmsResource resource) throws CmsException, CmsIndexException {
352
353         CmsFile file = CmsFile.upgrade(resource, cms);
354         if (file.getLength() <= 0) {
355             throw new CmsIndexException(Messages.get().container(Messages.ERR_NO_CONTENT_1, resource.getRootPath()));
356         }
357         return file;
358     }
359 }
Popular Tags