KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > h2 > tools > indexer > Indexer


1 /*
2  * Copyright 2004-2006 H2 Group. Licensed under the H2 License, Version 1.0 (http://h2database.com/html/license.html).
3  * Initial Developer: H2 Group
4  */

5 package org.h2.tools.indexer;
6
7 import java.io.File JavaDoc;
8 import java.io.FileInputStream JavaDoc;
9 import java.io.FileWriter JavaDoc;
10 import java.io.PrintWriter JavaDoc;
11 import java.util.ArrayList JavaDoc;
12 import java.util.Collections JavaDoc;
13 import java.util.Comparator JavaDoc;
14 import java.util.HashMap JavaDoc;
15 import java.util.HashSet JavaDoc;
16 import java.util.StringTokenizer JavaDoc;
17
18 import org.h2.util.IOUtils;
19 import org.h2.util.StringUtils;
20
21 public class Indexer {
22     
23     ArrayList JavaDoc pages = new ArrayList JavaDoc();
24     HashMap JavaDoc words = new HashMap JavaDoc();
25     HashSet JavaDoc noIndex = new HashSet JavaDoc();
26     ArrayList JavaDoc wordList;
27     int totalAllWeights;
28     PrintWriter JavaDoc output;
29     
30     Page page;
31     boolean title;
32     boolean heading;
33     
34     private static final int MIN_WORDSIZE = 3;
35     private static final int MAX_RELATIONS = 20;
36
37     public static void main(String JavaDoc[] args) throws Exception JavaDoc {
38         String JavaDoc dir = "docs";
39         String JavaDoc destDir = "docs/html";
40         for(int i=0; i<args.length; i++) {
41             if(args[i].equals("-dir")) {
42                 dir = args[++i];
43             } else if(args[i].equals("-destDir")) {
44                 destDir = args[++i];
45             }
46         }
47         Indexer app = new Indexer();
48         File JavaDoc file = new File JavaDoc(dir);
49         System.out.println("indexing " + file.getCanonicalPath());
50 // File file = new File("C:\\Programs\\selfhtml81");
51
// File file = new File("C:\\Programme\\selfhtml81");
52
app.setNoIndex(new String JavaDoc[] {
53                 "index.html",
54                 "html/header.html",
55                 "html/search.html",
56                 "html/frame.html",
57                 "javadoc/index.html",
58                 "javadoc/classes.html",
59                 "javadoc/allclasses-frame.html",
60                 "javadoc/allclasses-noframe.html",
61                 "javadoc/constant-values.html",
62                 "javadoc/overview-frame.html",
63                 "javadoc/overview-summary.html",
64                 "javadoc/serialized-form.html"
65         });
66         app.output = new PrintWriter JavaDoc(new FileWriter JavaDoc(destDir + "/index.js"));
67         app.readPages("", file, 0);
68         app.output.println("var pages=new Array();");
69         app.output.println("var ref=new Array();");
70         app.output.println("function Page(title, file) { this.title=title; this.file=file; }");
71         app.output.println("function load() {");
72         app.sortWords();
73         app.removeOverflowRelations();
74         app.sortPages();
75         app.listPages();
76         app.listWords();
77         app.output.println("}");
78         app.output.close();
79     }
80     
81     private void setNoIndex(String JavaDoc[] strings) {
82         for(int i=0; i<strings.length; i++) {
83             noIndex.add(strings[i]);
84         }
85     }
86
87     void sortWords() {
88         wordList = new ArrayList JavaDoc(words.values());
89         // TODO support ignored keywords (to shrink the index)
90
// String ignored = "";
91
// for(int i=0; i<wordList.size(); i++) {
92
// Word word = (Word) wordList.get(i);
93
// if(word.pages.size() >= pages.size()/4) {
94
// wordList.remove(i);
95
// if(ignored.length()==0) {
96
// ignored += ",";
97
// }
98
// ignored += word.name;
99
// i--;
100
// }
101
// }
102
// output.println("var ignored = '" + convertUTF(ignored) + "'");
103
// TODO support A, B, C,... class links in the index file and use them for combined AND searches
104
Collections.sort(wordList, new Comparator JavaDoc() {
105             public int compare(Object JavaDoc o0, Object JavaDoc o1) {
106                 Word w0 = (Word) o0;
107                 Word w1 = (Word) o1;
108                 return w0.name.compareToIgnoreCase(w1.name);
109             }
110         });
111     }
112     
113     void removeOverflowRelations() {
114         for(int i=0; i<wordList.size(); i++) {
115             Word word = (Word) wordList.get(i);
116             ArrayList JavaDoc weights = word.getSortedWeights();
117             int max = MAX_RELATIONS;
118             if(weights.size() > max) {
119                 while(max < weights.size()) {
120                     Weight weight = (Weight) weights.get(max);
121                     if(weight.value < Weight.HEADER) {
122                         break;
123                     }
124                     max++;
125                 }
126             }
127             while(max < weights.size()) {
128                 Weight weight = (Weight) weights.get(max);
129                 weights.remove(max);
130                 weight.page.relations--;
131             }
132         }
133     }
134     
135     void sortPages() {
136         Collections.sort(pages, new Comparator JavaDoc() {
137             public int compare(Object JavaDoc o0, Object JavaDoc o1) {
138                 Page p0 = (Page) o0;
139                 Page p1 = (Page) o1;
140                 return p0.relations == p1.relations ? 0 : p0.relations < p1.relations ? 1 : -1;
141             }
142         });
143         for(int i=0; i<pages.size(); i++) {
144             Page page = (Page) pages.get(i);
145             page.id = i;
146         }
147     }
148     
149     void listPages() {
150         for(int i=0; i<pages.size(); i++) {
151             Page page = (Page) pages.get(i);
152             output.println("pages["+page.id+"]=new Page('" + convertUTF(page.title)+"', '" + page.fileName+"');");
153         }
154     }
155
156     void readPages(String JavaDoc dir, File JavaDoc file, int level) throws Exception JavaDoc {
157         String JavaDoc name = file.getName();
158         String JavaDoc fileName = dir.length() > 0 ? dir + "/" + name : level > 0 ? name : "";
159         if (file.isDirectory()) {
160             File JavaDoc[] list = file.listFiles();
161             for (int i = 0; i < list.length; i++) {
162                 readPages(fileName, list[i], level + 1);
163             }
164             return;
165         }
166         String JavaDoc lower = StringUtils.toLowerEnglish(name);
167         if (!lower.endsWith(".html") && !lower.endsWith(".htm")) {
168             return;
169         }
170         if(!noIndex.contains(fileName)) {
171             page = new Page(pages.size(), fileName);
172             pages.add(page);
173             readPage(file);
174         }
175     }
176     
177     void listWords() {
178         output.println("// words: " + wordList.size());
179         StringBuffer JavaDoc buff = new StringBuffer JavaDoc();
180         String JavaDoc first = "";
181         int firstLen = 1;
182         int totalRelations = 0;
183         for(int i=0; i<wordList.size(); i++) {
184             Word word = (Word) wordList.get(i);
185             ArrayList JavaDoc weights = word.getSortedWeights();
186             String JavaDoc lower = StringUtils.toLowerEnglish(word.name);
187             if(!first.equals(lower.substring(0, firstLen))) {
188                 if(buff.length()>0) {
189                     output.println("ref['"+convertUTF(first)+"']='"+buff.toString()+"';");
190                     buff = new StringBuffer JavaDoc();
191                 }
192                 first = lower.substring(0, firstLen);
193             }
194             if(buff.length()>0) {
195                 buff.append(';');
196             }
197             buff.append(convertUTF(word.name));
198             buff.append('=');
199             String JavaDoc weightString="r";
200             totalRelations += weights.size();
201             for(int j=0; j<weights.size(); j++) {
202                 Weight weight = (Weight) weights.get(j);
203                 Page page = weight.page;
204                 if(j > 0) {
205                     buff.append(",");
206                 }
207                 String JavaDoc ws;
208                 if(weight.value >= Weight.TITLE) {
209                     ws = "t";
210                 } else if(weight.value >= Weight.HEADER) {
211                     ws = "h";
212                 } else {
213                     ws = "r";
214                 }
215                 if(ws != weightString) {
216                     weightString = ws;
217                     buff.append(ws);
218                 }
219                 buff.append(page.id);
220                 // TODO compress weight
221
// buff.append(",");
222
// buff.append(weight.value);
223
}
224         }
225         // TODO optimization: could support "a name=" and go to _first_ occurance, or scan page and mark
226
output.println("ref['"+convertUTF(first)+"']='"+buff.toString()+"';");
227         output.println("// totalRelations: "+totalRelations);
228     }
229     
230     private void readPage(File JavaDoc file) throws Exception JavaDoc {
231         byte[] data = IOUtils.readBytesAndClose(new FileInputStream JavaDoc(file), 0);
232         String JavaDoc text = new String JavaDoc(data, "UTF-8");
233         StringTokenizer JavaDoc t = new StringTokenizer JavaDoc(text, "<> \r\n", true);
234         boolean inTag = false;
235         title = false;
236         heading = false;
237         while (t.hasMoreTokens()) {
238             String JavaDoc token = t.nextToken();
239             if(token.length()==1) {
240                 char c = token.charAt(0);
241                 switch(c) {
242                 case '<': {
243                     if(inTag) {
244                         process("???");
245                     }
246                     inTag = true;
247                     if(!t.hasMoreTokens()) {
248                         break;
249                     }
250                     token = t.nextToken();
251                     if(token.startsWith("/")) {
252                         title = false;
253                         heading = false;
254                     } else if(token.equalsIgnoreCase("title")) {
255                         title = true;
256                     } else if(token.length() == 2 && Character.toLowerCase(token.charAt(0))=='h' && Character.isDigit(token.charAt(1))) {
257                         heading = true;
258                     }
259                     // TODO maybe skip script tags?
260
break;
261                 }
262                 case '>': {
263                     if(!inTag) {
264                         process("???");
265                     }
266                     inTag = false;
267                     break;
268                 }
269                 case '\r':
270                 case '\n':
271                 case ' ':
272                     break;
273                 default:
274                     if(!inTag) {
275                         process(token);
276                     }
277                 }
278             } else {
279                 if(!inTag) {
280                     process(token);
281                 }
282             }
283         }
284         
285         if (page.title == null || page.title.trim().length() == 0) {
286             System.out.println("Error: not title found in " + file.getName());
287             page.title = file.getName();
288         }
289         page.title = page.title.trim();
290     }
291     
292     void process(String JavaDoc text) {
293         text = HtmlConverter.convertHtml(text);
294         if(title) {
295             if(page.title == null) {
296                 page.title = text;
297             } else {
298                 page.title = page.title + " " + text;
299             }
300         }
301         int weight;
302         if(title) {
303             weight = Weight.TITLE;
304         } else if(heading) {
305             weight = Weight.HEADER;
306         } else {
307             weight = Weight.PARAGRAPH;
308         }
309         // this list of constants needs to be the same in search.js
310
StringTokenizer JavaDoc t = new StringTokenizer JavaDoc(text, " \t\r\n\"'.,:;!&/\\?%@`[]{}()+-=<>|*^~#$" +
311                 (char)160, // nbsp
312
false);
313         while (t.hasMoreTokens()) {
314             String JavaDoc token = t.nextToken();
315             if(token.length()<MIN_WORDSIZE) {
316                 continue;
317             }
318             if(Character.isDigit(token.charAt(0))) {
319                 continue;
320             }
321             String JavaDoc lower = StringUtils.toLowerEnglish(token);
322             Word word = (Word)words.get(lower);
323             if(word == null) {
324                 word = new Word(token);
325                 words.put(lower, word);
326             } else if(!word.name.equals(token)) {
327                 word.name = token.compareTo(word.name) > 0 ? token : word.name;
328             }
329             page.totalWeight += weight;
330             totalAllWeights += weight;
331             word.addPage(page, weight);
332         }
333     }
334
335     String JavaDoc convertUTF(String JavaDoc s) {
336         s = StringUtils.quoteJavaString(s);
337         s = s.substring(1, s.length()-1);
338         return s;
339     }
340
341 }
342
Popular Tags