1 17 18 19 20 package org.apache.lenya.lucene.index; 21 22 import java.io.File ; 23 import java.io.FileWriter ; 24 import java.io.IOException ; 25 import java.io.Reader ; 26 import java.io.StringReader ; 27 import java.io.StringWriter ; 28 import java.io.Writer ; 29 import java.lang.reflect.Method ; 30 31 import javax.xml.parsers.DocumentBuilder ; 32 import javax.xml.parsers.DocumentBuilderFactory ; 33 import javax.xml.transform.OutputKeys ; 34 import javax.xml.transform.Transformer ; 35 import javax.xml.transform.TransformerFactory ; 36 import javax.xml.transform.dom.DOMSource ; 37 import javax.xml.transform.stream.StreamResult ; 38 import javax.xml.transform.stream.StreamSource ; 39 40 import org.apache.lenya.lucene.parser.HTMLParser; 41 import org.apache.lenya.lucene.parser.HTMLParserFactory; 42 import org.apache.lenya.lucene.parser.StringCleaner; 43 import org.apache.lenya.xml.DocumentHelper; 44 import org.apache.lenya.xml.NamespaceHelper; 45 import org.apache.log4j.Category; 46 import org.apache.lucene.document.Document; 47 import org.apache.lucene.document.Field; 48 import org.w3c.dom.Element ; 49 import org.w3c.dom.Node ; 50 import org.w3c.dom.NodeList ; 51 import org.xml.sax.InputSource ; 52 53 56 public class ConfigurableDocumentCreator extends AbstractDocumentCreator { 57 Category log = Category.getInstance(ConfigurableDocumentCreator.class); 58 59 public static final String LUCENE_NAMESPACE = "http://apache.org/cocoon/lenya/lucene/1.0"; 60 public static final String XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml"; 61 62 67 public ConfigurableDocumentCreator(String stylesheet) { 68 this.stylesheet = stylesheet; 69 } 70 71 private String stylesheet; 72 73 78 public String getStylesheet() { 79 return stylesheet; 80 } 81 82 92 public Document getDocument(File file, File htdocsDumpDir) throws Exception { 93 log.debug(".getDocument() : indexing " + file.getAbsolutePath()); 94 try { 95 96 org.w3c.dom.Document sourceDocument = null; 97 DocumentBuilderFactory parserFactory = DocumentBuilderFactory.newInstance(); 98 parserFactory.setValidating(false); 99 parserFactory.setNamespaceAware(true); 100 parserFactory.setIgnoringElementContentWhitespace(true); 101 DocumentBuilder mybuilder = parserFactory.newDocumentBuilder(); 102 sourceDocument = mybuilder.parse(file.getAbsolutePath()); 103 104 105 116 117 118 119 120 DOMSource documentSource = new DOMSource (sourceDocument); 121 Writer documentWriter = new StringWriter (); 122 123 TransformerFactory tFactory = TransformerFactory.newInstance(); 124 Transformer documentTransformer = tFactory.newTransformer(new StreamSource (new StringReader (getStylesheet()))); 125 documentTransformer.setOutputProperty(OutputKeys.INDENT, "yes"); 126 documentTransformer.setOutputProperty(OutputKeys.ENCODING, "ISO-8859-1"); 127 128 String fileName = file.getName(); 129 130 if (fileName.endsWith(".pdf.txt")) { 131 fileName = fileName.substring(0, fileName.lastIndexOf(".txt")); 132 } 133 134 documentTransformer.setParameter("filename", fileName); 135 documentTransformer.transform(documentSource, new StreamResult (documentWriter)); 136 137 140 DocumentBuilder builder = DocumentHelper.createBuilder(); 141 org.w3c.dom.Document luceneDocument = builder.parse(new InputSource (new StringReader (documentWriter.toString()))); 142 143 NamespaceHelper helper = new NamespaceHelper(LUCENE_NAMESPACE, "luc", luceneDocument); 144 Element root = luceneDocument.getDocumentElement(); 145 Element [] fieldElements = helper.getChildren(root, "field"); 146 147 Document document = super.getDocument(file, htdocsDumpDir); 148 149 Class [] parameterTypes = { String .class, String .class }; 150 151 for (int i = 0; i < fieldElements.length; i++) { 152 String name = fieldElements[i].getAttribute("name"); 153 String type = fieldElements[i].getAttribute("type"); 154 String text = getText(fieldElements[i]); 155 156 Method method = Field.class.getMethod(type, parameterTypes); 157 158 String [] args = { name, text }; 159 160 Field field = (Field) method.invoke(null, args); 161 document.add(field); 162 163 } 164 165 return document; 166 } catch (Exception e) { 167 throw e; 168 } 169 } 170 171 174 protected void dumpLuceneDocument(File file, Writer writer) throws IOException { 175 log.debug(".dumpLuceneDocument(): Dump document: " + file.getAbsolutePath()); 176 177 File luceneDocumentFile = new File (file.getAbsolutePath() + ".xluc"); 178 luceneDocumentFile.createNewFile(); 179 180 FileWriter fileWriter = new FileWriter (luceneDocumentFile); 181 fileWriter.write(writer.toString()); 182 fileWriter.close(); 183 } 184 185 192 public static String getText(Node node) { 193 StringBuffer result = new StringBuffer (); 194 195 if (!node.hasChildNodes()) { 196 return ""; 197 } 198 199 NodeList list = node.getChildNodes(); 200 201 for (int i = 0; i < list.getLength(); i++) { 202 Node subnode = list.item(i); 203 204 if (subnode.getNodeType() == Node.TEXT_NODE) { 205 result.append(subnode.getNodeValue()); 206 } else if (subnode.getNodeType() == Node.CDATA_SECTION_NODE) { 207 result.append(subnode.getNodeValue()); 208 } else if (subnode.getNodeType() == Node.ENTITY_REFERENCE_NODE) { 209 result.append(getText(subnode)); 212 } 213 } 214 215 return result.toString(); 216 } 217 218 227 public static String getBodyText(File file) throws Exception { 228 HTMLParser parser = HTMLParserFactory.newInstance(file); 229 parser.parse(file); 230 231 Reader reader = parser.getReader(); 232 Writer writer = new StringWriter (); 233 234 int c; 235 236 while ((c = reader.read()) != -1) 237 writer.write(c); 238 239 String content = writer.toString(); 240 reader.close(); 241 writer.close(); 242 243 content = StringCleaner.clean(content); 244 245 return content; 246 } 247 } 248 | Popular Tags |