1 5 package org.h2.tools.indexer; 6 7 import java.io.File ; 8 import java.io.FileInputStream ; 9 import java.io.FileWriter ; 10 import java.io.PrintWriter ; 11 import java.util.ArrayList ; 12 import java.util.Collections ; 13 import java.util.Comparator ; 14 import java.util.HashMap ; 15 import java.util.HashSet ; 16 import java.util.StringTokenizer ; 17 18 import org.h2.util.IOUtils; 19 import org.h2.util.StringUtils; 20 21 public class Indexer { 22 23 ArrayList pages = new ArrayList (); 24 HashMap words = new HashMap (); 25 HashSet noIndex = new HashSet (); 26 ArrayList wordList; 27 int totalAllWeights; 28 PrintWriter output; 29 30 Page page; 31 boolean title; 32 boolean heading; 33 34 private static final int MIN_WORDSIZE = 3; 35 private static final int MAX_RELATIONS = 20; 36 37 public static void main(String [] args) throws Exception { 38 String dir = "docs"; 39 String destDir = "docs/html"; 40 for(int i=0; i<args.length; i++) { 41 if(args[i].equals("-dir")) { 42 dir = args[++i]; 43 } else if(args[i].equals("-destDir")) { 44 destDir = args[++i]; 45 } 46 } 47 Indexer app = new Indexer(); 48 File file = new File (dir); 49 System.out.println("indexing " + file.getCanonicalPath()); 50 app.setNoIndex(new String [] { 53 "index.html", 54 "html/header.html", 55 "html/search.html", 56 "html/frame.html", 57 "javadoc/index.html", 58 "javadoc/classes.html", 59 "javadoc/allclasses-frame.html", 60 "javadoc/allclasses-noframe.html", 61 "javadoc/constant-values.html", 62 "javadoc/overview-frame.html", 63 "javadoc/overview-summary.html", 64 "javadoc/serialized-form.html" 65 }); 66 app.output = new PrintWriter (new FileWriter (destDir + "/index.js")); 67 app.readPages("", file, 0); 68 app.output.println("var pages=new Array();"); 69 app.output.println("var ref=new Array();"); 70 app.output.println("function Page(title, file) { this.title=title; this.file=file; }"); 71 app.output.println("function load() {"); 72 app.sortWords(); 73 app.removeOverflowRelations(); 74 app.sortPages(); 75 app.listPages(); 76 app.listWords(); 77 app.output.println("}"); 78 app.output.close(); 79 } 80 81 private void setNoIndex(String [] strings) { 82 for(int i=0; i<strings.length; i++) { 83 noIndex.add(strings[i]); 84 } 85 } 86 87 void sortWords() { 88 wordList = new ArrayList (words.values()); 89 Collections.sort(wordList, new Comparator () { 105 public int compare(Object o0, Object o1) { 106 Word w0 = (Word) o0; 107 Word w1 = (Word) o1; 108 return w0.name.compareToIgnoreCase(w1.name); 109 } 110 }); 111 } 112 113 void removeOverflowRelations() { 114 for(int i=0; i<wordList.size(); i++) { 115 Word word = (Word) wordList.get(i); 116 ArrayList weights = word.getSortedWeights(); 117 int max = MAX_RELATIONS; 118 if(weights.size() > max) { 119 while(max < weights.size()) { 120 Weight weight = (Weight) weights.get(max); 121 if(weight.value < Weight.HEADER) { 122 break; 123 } 124 max++; 125 } 126 } 127 while(max < weights.size()) { 128 Weight weight = (Weight) weights.get(max); 129 weights.remove(max); 130 weight.page.relations--; 131 } 132 } 133 } 134 135 void sortPages() { 136 Collections.sort(pages, new Comparator () { 137 public int compare(Object o0, Object o1) { 138 Page p0 = (Page) o0; 139 Page p1 = (Page) o1; 140 return p0.relations == p1.relations ? 0 : p0.relations < p1.relations ? 1 : -1; 141 } 142 }); 143 for(int i=0; i<pages.size(); i++) { 144 Page page = (Page) pages.get(i); 145 page.id = i; 146 } 147 } 148 149 void listPages() { 150 for(int i=0; i<pages.size(); i++) { 151 Page page = (Page) pages.get(i); 152 output.println("pages["+page.id+"]=new Page('" + convertUTF(page.title)+"', '" + page.fileName+"');"); 153 } 154 } 155 156 void readPages(String dir, File file, int level) throws Exception { 157 String name = file.getName(); 158 String fileName = dir.length() > 0 ? dir + "/" + name : level > 0 ? name : ""; 159 if (file.isDirectory()) { 160 File [] list = file.listFiles(); 161 for (int i = 0; i < list.length; i++) { 162 readPages(fileName, list[i], level + 1); 163 } 164 return; 165 } 166 String lower = StringUtils.toLowerEnglish(name); 167 if (!lower.endsWith(".html") && !lower.endsWith(".htm")) { 168 return; 169 } 170 if(!noIndex.contains(fileName)) { 171 page = new Page(pages.size(), fileName); 172 pages.add(page); 173 readPage(file); 174 } 175 } 176 177 void listWords() { 178 output.println("// words: " + wordList.size()); 179 StringBuffer buff = new StringBuffer (); 180 String first = ""; 181 int firstLen = 1; 182 int totalRelations = 0; 183 for(int i=0; i<wordList.size(); i++) { 184 Word word = (Word) wordList.get(i); 185 ArrayList weights = word.getSortedWeights(); 186 String lower = StringUtils.toLowerEnglish(word.name); 187 if(!first.equals(lower.substring(0, firstLen))) { 188 if(buff.length()>0) { 189 output.println("ref['"+convertUTF(first)+"']='"+buff.toString()+"';"); 190 buff = new StringBuffer (); 191 } 192 first = lower.substring(0, firstLen); 193 } 194 if(buff.length()>0) { 195 buff.append(';'); 196 } 197 buff.append(convertUTF(word.name)); 198 buff.append('='); 199 String weightString="r"; 200 totalRelations += weights.size(); 201 for(int j=0; j<weights.size(); j++) { 202 Weight weight = (Weight) weights.get(j); 203 Page page = weight.page; 204 if(j > 0) { 205 buff.append(","); 206 } 207 String ws; 208 if(weight.value >= Weight.TITLE) { 209 ws = "t"; 210 } else if(weight.value >= Weight.HEADER) { 211 ws = "h"; 212 } else { 213 ws = "r"; 214 } 215 if(ws != weightString) { 216 weightString = ws; 217 buff.append(ws); 218 } 219 buff.append(page.id); 220 } 224 } 225 output.println("ref['"+convertUTF(first)+"']='"+buff.toString()+"';"); 227 output.println("// totalRelations: "+totalRelations); 228 } 229 230 private void readPage(File file) throws Exception { 231 byte[] data = IOUtils.readBytesAndClose(new FileInputStream (file), 0); 232 String text = new String (data, "UTF-8"); 233 StringTokenizer t = new StringTokenizer (text, "<> \r\n", true); 234 boolean inTag = false; 235 title = false; 236 heading = false; 237 while (t.hasMoreTokens()) { 238 String token = t.nextToken(); 239 if(token.length()==1) { 240 char c = token.charAt(0); 241 switch(c) { 242 case '<': { 243 if(inTag) { 244 process("???"); 245 } 246 inTag = true; 247 if(!t.hasMoreTokens()) { 248 break; 249 } 250 token = t.nextToken(); 251 if(token.startsWith("/")) { 252 title = false; 253 heading = false; 254 } else if(token.equalsIgnoreCase("title")) { 255 title = true; 256 } else if(token.length() == 2 && Character.toLowerCase(token.charAt(0))=='h' && Character.isDigit(token.charAt(1))) { 257 heading = true; 258 } 259 break; 261 } 262 case '>': { 263 if(!inTag) { 264 process("???"); 265 } 266 inTag = false; 267 break; 268 } 269 case '\r': 270 case '\n': 271 case ' ': 272 break; 273 default: 274 if(!inTag) { 275 process(token); 276 } 277 } 278 } else { 279 if(!inTag) { 280 process(token); 281 } 282 } 283 } 284 285 if (page.title == null || page.title.trim().length() == 0) { 286 System.out.println("Error: not title found in " + file.getName()); 287 page.title = file.getName(); 288 } 289 page.title = page.title.trim(); 290 } 291 292 void process(String text) { 293 text = HtmlConverter.convertHtml(text); 294 if(title) { 295 if(page.title == null) { 296 page.title = text; 297 } else { 298 page.title = page.title + " " + text; 299 } 300 } 301 int weight; 302 if(title) { 303 weight = Weight.TITLE; 304 } else if(heading) { 305 weight = Weight.HEADER; 306 } else { 307 weight = Weight.PARAGRAPH; 308 } 309 StringTokenizer t = new StringTokenizer (text, " \t\r\n\"'.,:;!&/\\?%@`[]{}()+-=<>|*^~#$" + 311 (char)160, false); 313 while (t.hasMoreTokens()) { 314 String token = t.nextToken(); 315 if(token.length()<MIN_WORDSIZE) { 316 continue; 317 } 318 if(Character.isDigit(token.charAt(0))) { 319 continue; 320 } 321 String lower = StringUtils.toLowerEnglish(token); 322 Word word = (Word)words.get(lower); 323 if(word == null) { 324 word = new Word(token); 325 words.put(lower, word); 326 } else if(!word.name.equals(token)) { 327 word.name = token.compareTo(word.name) > 0 ? token : word.name; 328 } 329 page.totalWeight += weight; 330 totalAllWeights += weight; 331 word.addPage(page, weight); 332 } 333 } 334 335 String convertUTF(String s) { 336 s = StringUtils.quoteJavaString(s); 337 s = s.substring(1, s.length()-1); 338 return s; 339 } 340 341 } 342 | Popular Tags |