1 16 package org.apache.cocoon.components.search; 17 18 import org.apache.avalon.framework.configuration.Configurable; 19 import org.apache.avalon.framework.configuration.Configuration; 20 import org.apache.avalon.framework.configuration.ConfigurationException; 21 import org.apache.avalon.framework.logger.AbstractLogEnabled; 22 import org.apache.avalon.framework.service.ServiceException; 23 import org.apache.avalon.framework.service.ServiceManager; 24 import org.apache.avalon.framework.service.Serviceable; 25 import org.apache.avalon.framework.thread.ThreadSafe; 26 import org.apache.cocoon.ProcessingException; 27 import org.apache.commons.lang.StringUtils; 28 import org.apache.excalibur.xml.sax.SAXParser; 29 import org.apache.lucene.document.DateField; 30 import org.apache.lucene.document.Document; 31 import org.apache.lucene.document.Field; 32 import org.xml.sax.InputSource ; 33 import org.xml.sax.SAXException ; 34 35 import java.io.IOException ; 36 import java.io.InputStream ; 37 import java.net.URL ; 38 import java.net.URLConnection ; 39 import java.util.Collections ; 40 import java.util.HashSet ; 41 import java.util.Iterator ; 42 import java.util.List ; 43 44 45 69 public class SimpleLuceneXMLIndexerImpl extends AbstractLogEnabled 70 implements LuceneXMLIndexer, Configurable, Serviceable, ThreadSafe { 71 72 77 protected ServiceManager manager = null; 78 79 88 public final static String CONTENT_VIEW_QUERY_CONFIG = "content-view-query"; 89 90 96 97 final static String CONTENT_VIEW_QUERY_DEFAULT = "cocoon-view=content"; 98 99 107 public final static String FIELDTAGS_CONFIG = "store-fields"; 108 109 114 final HashSet allowedContentType; 115 116 117 120 public SimpleLuceneXMLIndexerImpl() { 121 allowedContentType = new HashSet (); 122 allowedContentType.add("text/xml"); 123 allowedContentType.add("text/xhtml"); 124 fieldTags = new HashSet (); 125 } 126 127 128 private String contentViewQuery = CONTENT_VIEW_QUERY_DEFAULT; 129 private HashSet fieldTags; 130 131 132 139 public void configure(Configuration configuration) throws ConfigurationException { 140 141 Configuration[] children; 142 children = configuration.getChildren(FIELDTAGS_CONFIG); 143 if (children != null && children.length > 0) { 144 fieldTags = new HashSet (); 145 for (int i = 0; i < children.length; i++) { 146 String pattern = children[i].getValue(); 147 String params[] = StringUtils.split(pattern, ", "); 148 for (int index = 0; index < params.length; index++) { 149 String tokenized_pattern = params[index]; 150 if (!tokenized_pattern.equals("")) { 151 this.fieldTags.add(tokenized_pattern); 152 if (getLogger().isDebugEnabled()) { 153 getLogger().debug("add field: " + tokenized_pattern); 154 } 155 } 156 } 157 } 158 } else { 159 if (getLogger().isDebugEnabled()) { 160 getLogger().debug("Do not add any fields"); 161 } 162 } 163 this.contentViewQuery = configuration.getChild(CONTENT_VIEW_QUERY_CONFIG, true).getValue(CONTENT_VIEW_QUERY_DEFAULT); 164 if (getLogger().isDebugEnabled()) { 165 getLogger().debug("content view: " + this.contentViewQuery); 166 } 167 } 168 169 170 178 public void service(ServiceManager manager) throws ServiceException { 179 this.manager = manager; 180 } 181 182 183 190 public List build(URL url) 191 throws ProcessingException { 192 193 try { 194 URL contentURL = new URL (url, url.getFile() 195 + ((url.getFile().indexOf("?") == -1) ? "?" : "&") 196 + contentViewQuery); 197 URLConnection contentURLConnection = contentURL.openConnection(); 198 if (contentURLConnection == null) { 199 throw new ProcessingException("Can not open connection to URL " 200 + contentURL + " (null connection)"); 201 } 202 203 String contentType = contentURLConnection.getContentType(); 204 if (contentType == null) { 205 if (getLogger().isDebugEnabled()) { 206 getLogger().debug("Ignoring " + contentURL + " (no content type)"); 207 } 208 209 return Collections.EMPTY_LIST; 210 } 211 212 int index = contentType.indexOf(';'); 213 if (index != -1) { 214 contentType = contentType.substring(0, index); 215 } 216 217 if (allowedContentType.contains(contentType)) { 218 if (getLogger().isDebugEnabled()) { 219 getLogger().debug("Indexing " + contentURL + " (" + contentType + ")"); 220 } 221 222 LuceneIndexContentHandler luceneIndexContentHandler = new LuceneIndexContentHandler(); 223 luceneIndexContentHandler.setFieldTags(fieldTags); 224 indexDocument(contentURLConnection, luceneIndexContentHandler); 225 Iterator it = luceneIndexContentHandler.iterator(); 229 while (it.hasNext()) { 230 Document d = (Document) it.next(); 231 d.add(Field.UnIndexed(URL_FIELD, url.toString())); 232 d.add(new Field(UID_FIELD, uid(contentURLConnection), false, true, false)); 234 } 235 236 return luceneIndexContentHandler.allDocuments(); 237 } else { 238 if (getLogger().isDebugEnabled()) { 239 getLogger().debug("Ignoring " + contentURL + " (" + contentType + ")"); 240 } 241 242 return Collections.EMPTY_LIST; 243 } 244 } catch (IOException ioe) { 245 throw new ProcessingException("Cannot read URL " + url, ioe); 246 } 247 } 248 249 250 259 private void indexDocument(URLConnection contentURLConnection, 260 LuceneIndexContentHandler luceneIndexContentHandler) 261 throws ProcessingException { 262 263 InputStream is = null; 264 InputSource in = null; 265 SAXParser parser = null; 266 267 try { 268 is = contentURLConnection.getInputStream(); 269 in = new InputSource (is); 270 271 parser = (SAXParser) this.manager.lookup(SAXParser.ROLE); 273 parser.parse(in, luceneIndexContentHandler); 275 } catch (IOException ioe) { 279 throw new ProcessingException("Cannot read!", ioe); 280 } catch (SAXException saxe) { 281 throw new ProcessingException("Cannot parse!", saxe); 282 } catch (ServiceException se) { 283 throw new ProcessingException("Cannot lookup xml parser!", se); 284 } finally { 285 if (parser != null) { 286 this.manager.release(parser); 287 } 288 } 289 } 290 291 292 299 private String uid(URLConnection urlConnection) { 300 return urlConnection.toString().replace('/', '\u0000') + 305 "\u0000" + 306 DateField.timeToString(urlConnection.getLastModified()); 307 } 308 } 309 310 | Popular Tags |