1 17 18 19 20 package org.apache.lenya.lucene; 21 22 import java.io.BufferedReader ; 23 import java.io.File ; 24 import java.io.FileInputStream ; 25 import java.io.FileNotFoundException ; 26 import java.io.IOException ; 27 import java.io.InputStreamReader ; 28 import java.nio.charset.Charset ; 29 import java.nio.charset.IllegalCharsetNameException ; 30 import java.util.StringTokenizer ; 31 32 import org.apache.lenya.lucene.html.HTMLParser; 33 import org.apache.log4j.Category; 34 import org.apache.lucene.analysis.Token; 35 import org.apache.lucene.analysis.TokenStream; 36 import org.apache.lucene.analysis.standard.StandardAnalyzer; 37 38 39 42 public class ReTokenizeFile { 43 private static final Category log = Category.getInstance(ReTokenizeFile.class); 44 45 private int offset = 100; 46 47 52 public static void main(String [] args) { 53 if (args.length < 2) { 54 System.err.println("Usage: ReTokenizeFile filename word1 word2 ..."); 55 56 return; 57 } 58 59 try { 60 String [] words = new String [args.length - 1]; 62 for (int i = 1; i < args.length; i++) { 63 words[i - 1] = args[i]; 64 } 65 66 String s = null; 67 68 s = new ReTokenizeFile().getExcerpt(new File (args[0]), words); 69 System.err.println(".main(): Excerpt: " + s); 70 } catch (Exception e) { 71 System.err.println(".main(): " + e); 72 } 73 } 74 75 84 public String reTokenize(File file) throws Exception { 85 TokenStream ts = new StandardAnalyzer().tokenStream(new HTMLParser(file).getReader()); 86 87 Token token = null; 88 89 while ((token = ts.next()) != null) { 90 System.out.println("ReTokenizeFile.reTokenize(File): " + token.termText() + " " + 91 token.startOffset() + " " + token.endOffset() + " " + token.type()); 92 } 93 94 return file.getAbsolutePath(); 95 } 96 97 100 public String getExcerpt(File file, String [] words) 101 throws FileNotFoundException , IOException { 102 if (file.getName().substring(file.getName().length() - 4).equals(".pdf")) { 103 file = new File (file.getAbsolutePath() + ".txt"); 104 } 105 106 String content = readFileWithEncoding(file); 107 108 110 content = removeTags(content); 111 112 114 115 123 124 127 128 int index = -1; 129 130 for (int i = 0; i < words.length; i++) { 131 index = content.toLowerCase().indexOf(words[i].toLowerCase()); 132 133 if (index >= 0) { 134 int start = index - offset; 135 136 if (start < 0) { 137 start = 0; 138 } 139 140 int end = index + words[i].length() + offset; 141 142 if (end >= content.length()) { 143 end = content.length() - 1; 144 } 145 146 return content.substring(start, end); 147 } 148 } 149 150 return null; 151 } 152 153 160 public String removeTags(String string) { 161 StringBuffer sb = new StringBuffer (""); 162 163 boolean tag = false; 164 165 for (int i = 0; i < string.length(); i++) { 166 char ch = string.charAt(i); 167 if (ch == '<') { 168 tag = true; 169 } else if (ch == '>') { 170 tag = false; 171 } else { 172 if (!tag) sb.append(string.charAt(i)); 173 } 174 } 175 176 return sb.toString(); 177 } 178 179 186 public String tidy(String string) { 187 StringTokenizer st = new StringTokenizer (string, "<>&"); 188 189 StringBuffer sb = new StringBuffer (""); 190 191 while (st.hasMoreElements()) { 192 sb.append(st.nextToken()); 193 } 194 195 return sb.toString(); 196 } 197 198 207 public String emphasizeAsXML(String string, String [] words) { 208 String emphasizedString = "... Hello <word>World</word>! ..."; 209 210 String lowerCaseString = string.toLowerCase(); 211 212 for (int i = 0; i < words.length; i++) { 213 String word = words[i].toLowerCase(); 214 215 lowerCaseString = lowerCaseString.replaceAll(word, "<WORD>" + word + "</WORD>"); 217 } 218 219 lowerCaseString = lowerCaseString.toLowerCase(); 220 221 String result = ""; 223 224 int sourceIndex = 0; 225 int index = 0; 226 String [] tags = { "<word>", "</word>" }; 227 228 while (lowerCaseString.indexOf(tags[0], index) != -1) { 229 for (int tag = 0; tag < 2; tag++) { 230 int subStringLength = lowerCaseString.indexOf(tags[tag], index) - index; 231 String subString = string.substring(sourceIndex, sourceIndex + subStringLength); 232 result += (includeInCDATA(subString) + tags[tag]); 233 sourceIndex += subStringLength; 234 index += (subStringLength + tags[tag].length()); 235 } 236 } 237 238 result += includeInCDATA(string.substring(sourceIndex)); 239 240 return "<excerpt>" + result + "</excerpt>"; 241 } 242 243 246 protected String includeInCDATA(String string) { 247 return "<![CDATA[" + string + "]]>"; 248 } 249 250 256 protected String readFileWithEncoding(File file) throws FileNotFoundException , IOException { 257 String content = readHtmlFile(file); 258 int endOfFirstTag = content.indexOf(">"); 260 if(endOfFirstTag > 0 && content.charAt(endOfFirstTag-1) == '?') { 261 String upperLine = content.substring(0, endOfFirstTag).toUpperCase(); 262 int encStart = upperLine.indexOf("ENCODING=")+10; 263 int encEnd = -1; 264 265 if (encStart > 0) { 266 encEnd = upperLine.indexOf("\"", encStart); 267 if (encEnd == -1) { 268 encEnd = upperLine.indexOf("\'", encStart); 269 } 270 } 271 if(encStart > 0 && encEnd > 0) { 272 String xmlCharset = upperLine.substring(encStart, encEnd); 273 try { 274 if (Charset.isSupported(xmlCharset)) { 275 content = readFile(file, Charset.forName(xmlCharset)); 276 } 277 } catch (IllegalCharsetNameException e) { 278 } 280 } 281 } 282 return content; 283 } 284 285 286 293 protected String readHtmlFile(File file) throws FileNotFoundException , IOException { 294 java.io.Reader reader = new HTMLParser(file).getReader(); 295 char[] chars = new char[1024]; 296 int chars_read; 297 java.io.Writer writer = new java.io.StringWriter (); 298 299 while ((chars_read = reader.read(chars)) > 0) { 300 writer.write(chars, 0, chars_read); 301 } 302 return writer.toString(); 303 } 304 305 313 protected String readFile(File file, Charset charset) throws FileNotFoundException , IOException { 314 FileInputStream inputFile = new FileInputStream (file); 315 InputStreamReader inputStream; 316 if(charset != null) { 317 inputStream = new InputStreamReader (inputFile, charset); 318 } else { 319 inputStream = new InputStreamReader (inputFile); 320 } 321 BufferedReader bufferReader = new BufferedReader (inputStream); 322 StringBuffer buffer = new StringBuffer (); 323 String line = ""; 324 while (bufferReader.ready()) { 325 line = bufferReader.readLine(); 326 buffer.append(line); 327 } 328 bufferReader.close(); 329 inputStream.close(); 330 inputFile.close(); 331 return buffer.toString(); 332 } 333 334 337 public void setOffset(int offset) { 338 this.offset = offset; 339 } 340 } 341 | Popular Tags |