KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > apache > lenya > lucene > ReTokenizeFile


1 /*
2  * Copyright 1999-2004 The Apache Software Foundation
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  *
16  */

17
18 /* $Id: ReTokenizeFile.java 43119 2004-07-12 10:21:37Z michi $ */
19
20 package org.apache.lenya.lucene;
21
22 import java.io.BufferedReader JavaDoc;
23 import java.io.File JavaDoc;
24 import java.io.FileInputStream JavaDoc;
25 import java.io.FileNotFoundException JavaDoc;
26 import java.io.IOException JavaDoc;
27 import java.io.InputStreamReader JavaDoc;
28 import java.nio.charset.Charset JavaDoc;
29 import java.nio.charset.IllegalCharsetNameException JavaDoc;
30 import java.util.StringTokenizer JavaDoc;
31
32 import org.apache.lenya.lucene.html.HTMLParser;
33 import org.apache.log4j.Category;
34 import org.apache.lucene.analysis.Token;
35 import org.apache.lucene.analysis.TokenStream;
36 import org.apache.lucene.analysis.standard.StandardAnalyzer;
37
38
39 /**
40  * DOCUMENT ME!
41  */

42 public class ReTokenizeFile {
43     private static final Category log = Category.getInstance(ReTokenizeFile.class);
44
45     private int offset = 100;
46     
47     /**
48      * DOCUMENT ME!
49      *
50      * @param args DOCUMENT ME!
51      */

52     public static void main(String JavaDoc[] args) {
53         if (args.length < 2) {
54             System.err.println("Usage: ReTokenizeFile filename word1 word2 ...");
55
56             return;
57         }
58
59         try {
60             String JavaDoc[] words = new String JavaDoc[args.length - 1]; //{"Cocoon","Lenya"};
61

62             for (int i = 1; i < args.length; i++) {
63                 words[i - 1] = args[i];
64             }
65
66             String JavaDoc s = null;
67
68             s = new ReTokenizeFile().getExcerpt(new File JavaDoc(args[0]), words);
69             System.err.println(".main(): Excerpt: " + s);
70         } catch (Exception JavaDoc e) {
71             System.err.println(".main(): " + e);
72         }
73     }
74
75     /**
76      * DOCUMENT ME!
77      *
78      * @param file DOCUMENT ME!
79      *
80      * @return DOCUMENT ME!
81      *
82      * @throws Exception DOCUMENT ME!
83      */

84     public String JavaDoc reTokenize(File JavaDoc file) throws Exception JavaDoc {
85         TokenStream ts = new StandardAnalyzer().tokenStream(new HTMLParser(file).getReader());
86
87         Token token = null;
88
89         while ((token = ts.next()) != null) {
90             System.out.println("ReTokenizeFile.reTokenize(File): " + token.termText() + " " +
91                 token.startOffset() + " " + token.endOffset() + " " + token.type());
92         }
93
94         return file.getAbsolutePath();
95     }
96
97     /**
98      *
99      */

100     public String JavaDoc getExcerpt(File JavaDoc file, String JavaDoc[] words)
101         throws FileNotFoundException JavaDoc, IOException JavaDoc {
102         if (file.getName().substring(file.getName().length() - 4).equals(".pdf")) {
103             file = new File JavaDoc(file.getAbsolutePath() + ".txt");
104         }
105         
106         String JavaDoc content = readFileWithEncoding(file);
107
108     //log.debug(content);
109

110     content = removeTags(content);
111
112     //log.debug(content);
113

114         
115         /*java.io.Reader reader = new HTMLParser(file).getReader();
116         char[] chars = new char[1024];
117         int chars_read;
118         java.io.Writer writer = new java.io.StringWriter();
119
120         while ((chars_read = reader.read(chars)) > 0) {
121             writer.write(chars, 0, chars_read);
122         }*/

123
124         //String html = writer.toString();
125
//html = writer.toString();
126

127         
128         int index = -1;
129
130         for (int i = 0; i < words.length; i++) {
131             index = content.toLowerCase().indexOf(words[i].toLowerCase());
132
133             if (index >= 0) {
134                 int start = index - offset;
135
136                 if (start < 0) {
137                     start = 0;
138                 }
139
140                 int end = index + words[i].length() + offset;
141
142                 if (end >= content.length()) {
143                     end = content.length() - 1;
144                 }
145
146                 return content.substring(start, end);
147             }
148         }
149
150         return null;
151     }
152
153     /**
154      * Remove tags
155      *
156      * @param string Content with tags
157      *
158      * @return Content without tags
159      */

160     public String JavaDoc removeTags(String JavaDoc string) {
161         StringBuffer JavaDoc sb = new StringBuffer JavaDoc("");
162
163         boolean tag = false;
164
165         for (int i = 0; i < string.length(); i++) {
166             char ch = string.charAt(i);
167         if (ch == '<') {
168                 tag = true;
169             } else if (ch == '>') {
170                 tag = false;
171             } else {
172                 if (!tag) sb.append(string.charAt(i));
173             }
174         }
175
176         return sb.toString();
177     }
178
179     /**
180      * Is being used by search-and-results.xsp. Is this really still necessary?
181      *
182      * @param string content
183      *
184      * @return content without <>&
185      */

186     public String JavaDoc tidy(String JavaDoc string) {
187         StringTokenizer JavaDoc st = new StringTokenizer JavaDoc(string, "<>&");
188
189         StringBuffer JavaDoc sb = new StringBuffer JavaDoc("");
190
191         while (st.hasMoreElements()) {
192             sb.append(st.nextToken());
193         }
194
195         return sb.toString();
196     }
197
198     /**
199      * Encloses all words in <code>words</code> that appear in <code>string</code> in
200      * &lt;word&gt; tags. The whole string is enclosed in &lt;excerpt&gt; tags.
201      *
202      * @param string The string to process.
203      * @param words The words to emphasize.
204      *
205      * @return DOCUMENT ME!
206      */

207     public String JavaDoc emphasizeAsXML(String JavaDoc string, String JavaDoc[] words) {
208         String JavaDoc emphasizedString = "... Hello <word>World</word>! ...";
209
210         String JavaDoc lowerCaseString = string.toLowerCase();
211
212         for (int i = 0; i < words.length; i++) {
213             String JavaDoc word = words[i].toLowerCase();
214
215             // use uppercase tags so that they are not replaced
216
lowerCaseString = lowerCaseString.replaceAll(word, "<WORD>" + word + "</WORD>");
217         }
218
219         lowerCaseString = lowerCaseString.toLowerCase();
220
221         //if (true) return "<excerpt>" + lowerCaseString + "</excerpt>";
222
String JavaDoc result = "";
223
224         int sourceIndex = 0;
225         int index = 0;
226         String JavaDoc[] tags = { "<word>", "</word>" };
227
228         while (lowerCaseString.indexOf(tags[0], index) != -1) {
229             for (int tag = 0; tag < 2; tag++) {
230                 int subStringLength = lowerCaseString.indexOf(tags[tag], index) - index;
231                 String JavaDoc subString = string.substring(sourceIndex, sourceIndex + subStringLength);
232                 result += (includeInCDATA(subString) + tags[tag]);
233                 sourceIndex += subStringLength;
234                 index += (subStringLength + tags[tag].length());
235             }
236         }
237
238         result += includeInCDATA(string.substring(sourceIndex));
239
240         return "<excerpt>" + result + "</excerpt>";
241     }
242
243     /**
244      * Includes a string in CDATA delimiters.
245      */

246     protected String JavaDoc includeInCDATA(String JavaDoc string) {
247         return "<![CDATA[" + string + "]]>";
248     }
249     
250     /**
251      * reads a file and if the file is an xml file, determine its encoding
252      * @param file the file to read.
253      * (if the file is an xml file with an specified encoding, this will be overwritten)
254      * @return the contents of the file.
255      */

256     protected String JavaDoc readFileWithEncoding(File JavaDoc file) throws FileNotFoundException JavaDoc, IOException JavaDoc {
257         String JavaDoc content = readHtmlFile(file);
258         // test if the file contains xml data and extract the encoding
259
int endOfFirstTag = content.indexOf(">");
260         if(endOfFirstTag > 0 && content.charAt(endOfFirstTag-1) == '?') {
261             String JavaDoc upperLine = content.substring(0, endOfFirstTag).toUpperCase();
262             int encStart = upperLine.indexOf("ENCODING=")+10;
263             int encEnd = -1;
264
265             if (encStart > 0) {
266                 encEnd = upperLine.indexOf("\"", encStart);
267                 if (encEnd == -1) {
268                     encEnd = upperLine.indexOf("\'", encStart);
269                 }
270             }
271             if(encStart > 0 && encEnd > 0) {
272                 String JavaDoc xmlCharset = upperLine.substring(encStart, encEnd);
273                 try {
274                     if (Charset.isSupported(xmlCharset)) {
275                         content = readFile(file, Charset.forName(xmlCharset));
276                     }
277                 } catch (IllegalCharsetNameException JavaDoc e) {
278                     // do nothing - thrown by Charset.isSupported
279
}
280             }
281         }
282         return content;
283     }
284     
285     
286     /**
287      * read a html file.
288      * @param file the file to read
289      * @return the content of the file.
290      * @throws FileNotFoundException if the file does not exists.
291      * @throws IOException if something else went wrong.
292      */

293     protected String JavaDoc readHtmlFile(File JavaDoc file) throws FileNotFoundException JavaDoc, IOException JavaDoc {
294         java.io.Reader JavaDoc reader = new HTMLParser(file).getReader();
295         char[] chars = new char[1024];
296         int chars_read;
297         java.io.Writer JavaDoc writer = new java.io.StringWriter JavaDoc();
298
299         while ((chars_read = reader.read(chars)) > 0) {
300             writer.write(chars, 0, chars_read);
301         }
302         return writer.toString();
303     }
304     
305     /**
306      * reads a file in the specified encoding.
307      * @param file the file to read.
308      * @param encoding the file encoding
309      * @return the content of the file.
310      * @throws FileNotFoundException if the file does not exists.
311      * @throws IOException if something else went wrong.
312      */

313     protected String JavaDoc readFile(File JavaDoc file, Charset JavaDoc charset) throws FileNotFoundException JavaDoc, IOException JavaDoc {
314         FileInputStream JavaDoc inputFile = new FileInputStream JavaDoc(file);
315         InputStreamReader JavaDoc inputStream;
316         if(charset != null) {
317             inputStream = new InputStreamReader JavaDoc(inputFile, charset);
318         } else {
319             inputStream = new InputStreamReader JavaDoc(inputFile);
320         }
321         BufferedReader JavaDoc bufferReader = new BufferedReader JavaDoc(inputStream);
322         StringBuffer JavaDoc buffer = new StringBuffer JavaDoc();
323         String JavaDoc line = "";
324         while (bufferReader.ready()) {
325             line = bufferReader.readLine();
326             buffer.append(line);
327         }
328         bufferReader.close();
329         inputStream.close();
330         inputFile.close();
331         return buffer.toString();
332     }
333
334     /**
335      * Set offset
336      */

337     public void setOffset(int offset) {
338         this.offset = offset;
339     }
340 }
341
Popular Tags