Indexer


1   /*
2    * Copyright 2004-2006 H2 Group. Licensed under the H2 License, Version 1.0 (http://h2database.com/html/license.html).
3    * Initial Developer: H2 Group
4    */
5   package org.h2.tools.indexer;
6   
7   import java.io.File  ;
8   import java.io.FileInputStream  ;
9   import java.io.FileWriter  ;
10  import java.io.PrintWriter  ;
11  import java.util.ArrayList  ;
12  import java.util.Collections  ;
13  import java.util.Comparator  ;
14  import java.util.HashMap  ;
15  import java.util.HashSet  ;
16  import java.util.StringTokenizer  ;
17  
18  import org.h2.util.IOUtils;
19  import org.h2.util.StringUtils;
20  
21  public class Indexer {
22      
23      ArrayList   pages = new ArrayList  ();
24      HashMap   words = new HashMap  ();
25      HashSet   noIndex = new HashSet  ();
26      ArrayList   wordList;
27      int totalAllWeights;
28      PrintWriter   output;
29      
30      Page page;
31      boolean title;
32      boolean heading;
33      
34      private static final int MIN_WORDSIZE = 3;
35      private static final int MAX_RELATIONS = 20;
36  
37      public static void main(String  [] args) throws Exception   {
38          String   dir = "docs";
39          String   destDir = "docs/html";
40          for(int i=0; i<args.length; i++) {
41              if(args[i].equals("-dir")) {
42                  dir = args[++i];
43              } else if(args[i].equals("-destDir")) {
44                  destDir = args[++i];
45              }
46          }
47          Indexer app = new Indexer();
48          File   file = new File  (dir);
49          System.out.println("indexing " + file.getCanonicalPath());
50  //        File file = new File("C:\\Programs\\selfhtml81");
51  //      File file = new File("C:\\Programme\\selfhtml81");
52          app.setNoIndex(new String  [] {
53                  "index.html",
54                  "html/header.html",
55                  "html/search.html",
56                  "html/frame.html",
57                  "javadoc/index.html",
58                  "javadoc/classes.html",
59                  "javadoc/allclasses-frame.html",
60                  "javadoc/allclasses-noframe.html",
61                  "javadoc/constant-values.html",
62                  "javadoc/overview-frame.html",
63                  "javadoc/overview-summary.html",
64                  "javadoc/serialized-form.html"
65          });
66          app.output = new PrintWriter  (new FileWriter  (destDir + "/index.js"));
67          app.readPages("", file, 0);
68          app.output.println("var pages=new Array();");
69          app.output.println("var ref=new Array();");
70          app.output.println("function Page(title, file) { this.title=title; this.file=file; }");
71          app.output.println("function load() {");
72          app.sortWords();
73          app.removeOverflowRelations();
74          app.sortPages();
75          app.listPages();
76          app.listWords();
77          app.output.println("}");
78          app.output.close();
79      }
80      
81      private void setNoIndex(String  [] strings) {
82          for(int i=0; i<strings.length; i++) {
83              noIndex.add(strings[i]);
84          }
85      }
86  
87      void sortWords() {
88          wordList = new ArrayList  (words.values());
89          // TODO support ignored keywords (to shrink the index)
90  //        String ignored = "";
91  //        for(int i=0; i<wordList.size(); i++) {
92  //            Word word = (Word) wordList.get(i);
93  //            if(word.pages.size() >= pages.size()/4) {
94  //                wordList.remove(i);
95  //                if(ignored.length()==0) {
96  //                    ignored += ",";
97  //                }
98  //                ignored += word.name;
99  //                i--;
100 //            }
101 //        }
102 //        output.println("var ignored = '" + convertUTF(ignored) + "'");
103         // TODO support A, B, C,... class links in the index file and use them for combined AND searches
104         Collections.sort(wordList, new Comparator  () {
105             public int compare(Object   o0, Object   o1) {
106                 Word w0 = (Word) o0;
107                 Word w1 = (Word) o1;
108                 return w0.name.compareToIgnoreCase(w1.name);
109             }
110         });
111     }
112     
113     void removeOverflowRelations() {
114         for(int i=0; i<wordList.size(); i++) {
115             Word word = (Word) wordList.get(i);
116             ArrayList   weights = word.getSortedWeights();
117             int max = MAX_RELATIONS;
118             if(weights.size() > max) {
119                 while(max < weights.size()) {
120                     Weight weight = (Weight) weights.get(max);
121                     if(weight.value < Weight.HEADER) {
122                         break;
123                     }
124                     max++;
125                 }
126             }
127             while(max < weights.size()) {
128                 Weight weight = (Weight) weights.get(max);
129                 weights.remove(max);
130                 weight.page.relations--;
131             }
132         }
133     }
134     
135     void sortPages() {
136         Collections.sort(pages, new Comparator  () {
137             public int compare(Object   o0, Object   o1) {
138                 Page p0 = (Page) o0;
139                 Page p1 = (Page) o1;
140                 return p0.relations == p1.relations ? 0 : p0.relations < p1.relations ? 1 : -1;
141             }
142         });
143         for(int i=0; i<pages.size(); i++) {
144             Page page = (Page) pages.get(i);
145             page.id = i;
146         }
147     }    
148     
149     void listPages() {
150         for(int i=0; i<pages.size(); i++) {
151             Page page = (Page) pages.get(i);
152             output.println("pages["+page.id+"]=new Page('" + convertUTF(page.title)+"', '" + page.fileName+"');");
153         }
154     }
155 
156     void readPages(String   dir, File   file, int level) throws Exception   {
157         String   name = file.getName();
158         String   fileName = dir.length() > 0 ? dir + "/" + name : level > 0 ? name : "";
159         if (file.isDirectory()) {
160             File  [] list = file.listFiles();
161             for (int i = 0; i < list.length; i++) {
162                 readPages(fileName, list[i], level + 1);
163             }
164             return;
165         }
166         String   lower = StringUtils.toLowerEnglish(name);
167         if (!lower.endsWith(".html") && !lower.endsWith(".htm")) {
168             return;
169         }
170         if(!noIndex.contains(fileName)) {
171             page = new Page(pages.size(), fileName);
172             pages.add(page);
173             readPage(file);
174         }
175     }
176     
177     void listWords() {
178         output.println("// words: " + wordList.size());
179         StringBuffer   buff = new StringBuffer  ();
180         String   first = "";
181         int firstLen = 1;
182         int totalRelations = 0;
183         for(int i=0; i<wordList.size(); i++) {
184             Word word = (Word) wordList.get(i);
185             ArrayList   weights = word.getSortedWeights();
186             String   lower = StringUtils.toLowerEnglish(word.name);        
187             if(!first.equals(lower.substring(0, firstLen))) {
188                 if(buff.length()>0) {
189                     output.println("ref['"+convertUTF(first)+"']='"+buff.toString()+"';");
190                     buff = new StringBuffer  ();
191                 }
192                 first = lower.substring(0, firstLen);
193             }
194             if(buff.length()>0) {
195                 buff.append(';');
196             }
197             buff.append(convertUTF(word.name));
198             buff.append('=');
199             String   weightString="r";
200             totalRelations += weights.size();
201             for(int j=0; j<weights.size(); j++) {
202                 Weight weight = (Weight) weights.get(j);
203                 Page page = weight.page;
204                 if(j > 0) {
205                     buff.append(",");
206                 }
207                 String   ws;
208                 if(weight.value >= Weight.TITLE) {
209                     ws = "t";
210                 } else if(weight.value >= Weight.HEADER) {
211                     ws = "h";
212                 } else {
213                     ws = "r";
214                 }
215                 if(ws != weightString) {
216                     weightString = ws;
217                     buff.append(ws);
218                 }
219                 buff.append(page.id);
220                 // TODO compress weight
221 //                buff.append(",");
222 //                buff.append(weight.value);
223             }
224         }
225         // TODO optimization: could support "a name=" and go to _first_ occurance, or scan page and mark
226         output.println("ref['"+convertUTF(first)+"']='"+buff.toString()+"';");
227         output.println("// totalRelations: "+totalRelations);
228     }
229     
230     private void readPage(File   file) throws Exception   {
231         byte[] data = IOUtils.readBytesAndClose(new FileInputStream  (file), 0);
232         String   text = new String  (data, "UTF-8");
233         StringTokenizer   t = new StringTokenizer  (text, "<> \r\n", true);
234         boolean inTag = false;
235         title = false;
236         heading = false;
237         while (t.hasMoreTokens()) {
238             String   token = t.nextToken();
239             if(token.length()==1) {
240                 char c = token.charAt(0);
241                 switch(c) {
242                 case '<': {
243                     if(inTag) {
244                         process("???");
245                     }
246                     inTag = true;
247                     if(!t.hasMoreTokens()) {
248                         break;
249                     }
250                     token = t.nextToken();
251                     if(token.startsWith("/")) {
252                         title = false;
253                         heading = false;
254                     } else if(token.equalsIgnoreCase("title")) {
255                         title = true;
256                     } else if(token.length() == 2 && Character.toLowerCase(token.charAt(0))=='h' && Character.isDigit(token.charAt(1))) {
257                         heading = true;
258                     }
259                     // TODO maybe skip script tags?
260                     break;
261                 }
262                 case '>': {
263                     if(!inTag) {
264                         process("???");
265                     }
266                     inTag = false;
267                     break;
268                 }
269                 case '\r':
270                 case '\n':
271                 case ' ':
272                     break;
273                 default:
274                     if(!inTag) {
275                         process(token);
276                     }
277                 }
278             } else {
279                 if(!inTag) {
280                     process(token);
281                 }
282             }
283         }
284         
285         if (page.title == null || page.title.trim().length() == 0) {
286             System.out.println("Error: not title found in " + file.getName());
287             page.title = file.getName();
288         }
289         page.title = page.title.trim();
290     }
291     
292     void process(String   text) {
293         text = HtmlConverter.convertHtml(text);
294         if(title) {
295             if(page.title == null) {
296                 page.title = text;
297             } else {
298                 page.title = page.title + " " + text;
299             }
300         }
301         int weight;
302         if(title) {
303             weight = Weight.TITLE;
304         } else if(heading) {
305             weight = Weight.HEADER;
306         } else {
307             weight = Weight.PARAGRAPH;
308         }
309         // this list of constants needs to be the same in search.js
310         StringTokenizer   t = new StringTokenizer  (text, " \t\r\n\"'.,:;!&/\\?%@`[]{}()+-=<>|*^~#$" + 
311                 (char)160, // nbsp 
312                 false);
313         while (t.hasMoreTokens()) {
314             String   token = t.nextToken();
315             if(token.length()<MIN_WORDSIZE) {
316                 continue;
317             }
318             if(Character.isDigit(token.charAt(0))) {
319                 continue;
320             }
321             String   lower = StringUtils.toLowerEnglish(token);
322             Word word = (Word)words.get(lower);
323             if(word == null) {
324                 word = new Word(token);
325                 words.put(lower, word);
326             } else if(!word.name.equals(token)) {
327                 word.name = token.compareTo(word.name) > 0 ? token : word.name;
328             }
329             page.totalWeight += weight;
330             totalAllWeights += weight;
331             word.addPage(page, weight);
332         }
333     }    
334 
335     String   convertUTF(String   s) {
336         s = StringUtils.quoteJavaString(s);
337         s = s.substring(1, s.length()-1);
338         return s;
339     }
340 
341 }
342
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags