1 32 33 package com.knowgate.lucene; 34 35 import java.util.Properties ; 36 37 import java.io.FileNotFoundException ; 38 import java.io.IOException ; 39 import java.io.File ; 40 import java.io.FilenameFilter ; 41 import java.io.FileReader ; 42 import java.io.FileInputStream ; 43 44 import org.apache.lucene.analysis.*; 45 import org.apache.lucene.index.*; 46 import org.apache.lucene.document.*; 47 48 import org.apache.oro.text.regex.*; 49 50 import com.knowgate.debug.DebugFile; 51 52 58 59 public class Crawler { 60 61 class RegExpFilter implements FilenameFilter { 62 63 private Pattern oPattern; 64 private PatternMatcher oMatcher; 65 private PatternCompiler oCompiler; 66 67 RegExpFilter (String sPattern) throws MalformedPatternException { 68 oMatcher = new Perl5Matcher(); 69 oCompiler = new Perl5Compiler(); 70 oPattern = oCompiler.compile(sPattern); 71 } 72 73 public boolean accept(File oFile, String sName) { 74 return oFile.isDirectory() || oMatcher.matches(sName, oPattern); 75 } 76 } 78 81 private String sSeparator; 82 private PatternMatcher oMatcher; 83 private PatternCompiler oCompiler; 84 private Pattern oTagPattern; 85 86 88 public Crawler() { 89 oMatcher = new Perl5Matcher(); 90 oCompiler = new Perl5Compiler(); 91 92 try { 93 oTagPattern = oCompiler.compile("<[^>]*>"); 94 } 95 catch (MalformedPatternException mpe) { } 96 97 sSeparator = System.getProperty("file.separator"); 98 } 99 101 private Document makeHTMLDocument (String sRelativePath, String sName, String sHTMLText) { 102 int iTitleStart, iTitleEnd; 103 104 if (DebugFile.trace) DebugFile.writeln("Crawler.addHTMLDocument(" + sRelativePath + "," + sName + ")"); 105 106 iTitleStart = sHTMLText.indexOf("<TITLE>"); 107 if (iTitleStart<0) iTitleStart = sHTMLText.indexOf("<title>"); 108 109 if (iTitleStart>=0) { 110 iTitleEnd = sHTMLText.indexOf("</TITLE>"); 111 if (iTitleEnd<0) iTitleEnd = sHTMLText.indexOf("</title>"); 112 } 113 else 114 iTitleEnd = -1; 115 116 String sTitle; 117 118 if (iTitleStart>=0 && iTitleEnd>0) 119 120 sTitle = sHTMLText.substring (iTitleStart+7, iTitleEnd).trim(); 121 122 else { 123 124 sTitle = null; 125 126 129 iTitleStart = sHTMLText.indexOf("<H1>"); 130 if (iTitleStart<0) iTitleStart = sHTMLText.indexOf("<h1>"); 131 132 if (iTitleStart>=0) { 133 iTitleEnd = sHTMLText.indexOf("</H1>"); 134 if (iTitleEnd<0) iTitleEnd = sHTMLText.indexOf("</h1>"); 135 } 136 137 if (iTitleStart>=0 && iTitleEnd>0) 138 sTitle = sHTMLText.substring (iTitleStart+4, iTitleEnd).trim(); 139 140 iTitleStart = sHTMLText.indexOf("<H2>"); 141 if (iTitleStart<0) iTitleStart = sHTMLText.indexOf("<h2>"); 142 143 if (iTitleStart>=0) { 144 iTitleEnd = sHTMLText.indexOf("</H2>"); 145 if (iTitleEnd<0) iTitleEnd = sHTMLText.indexOf("</h2>"); 146 } 147 148 if (iTitleStart>=0 && iTitleEnd>0) 149 if (null==sTitle) 150 sTitle = sHTMLText.substring (iTitleStart+4, iTitleEnd).trim(); 151 else 152 sTitle += " " + sHTMLText.substring (iTitleStart+4, iTitleEnd).trim(); 153 154 157 if (sTitle==null) sTitle = "untitled"; 158 } 159 160 Document oDoc = new Document(); 161 162 oDoc.add (new Field("subpath", sRelativePath, true, false, false)); 163 oDoc.add (new Field("name", sName, true, false, false)); 164 oDoc.add (Field.Keyword("title", sTitle)); 165 oDoc.add (Field.UnStored("text" , Util.substitute(oMatcher, oTagPattern, new StringSubstitution(""), sHTMLText, Util.SUBSTITUTE_ALL))); 166 167 return oDoc; 168 } 170 172 private void crawlDir (IndexWriter oIWrt, String sBasePath, int iBasePathlen, RegExpFilter oFileFilter) 173 throws IOException , FileNotFoundException { 174 175 if (DebugFile.trace) { 176 DebugFile.writeln("Begin Crawler.crawlDir(" + sBasePath + ")"); 177 DebugFile.incIdent(); 178 } 179 180 File oBaseDir = new File (sBasePath); 181 String sName; 182 183 if (!oBaseDir.exists()) 184 throw new FileNotFoundException (sBasePath + " directory does not exist"); 185 186 if (!oBaseDir.isDirectory()) 187 throw new IOException (sBasePath + " is not a directory"); 188 189 File [] aFiles = oBaseDir.listFiles(); 190 int iFiles = aFiles.length; 191 192 int iBuffer; 193 char[] aBuffer; 194 String sBuffer; 195 String sText; 196 Document oDoc; 197 198 sBasePath += sSeparator; 199 200 for (int f=0; f<iFiles; f++) { 201 202 if (aFiles[f].isDirectory()) { 203 204 crawlDir ( oIWrt, sBasePath + aFiles[f].getName(), iBasePathlen, oFileFilter); 205 } 206 207 else { 208 209 sName = aFiles[f].getName().toLowerCase(); 210 211 if (sName.endsWith(".htm") || sName.endsWith(".html") || sName.endsWith(".shtml") || sName.endsWith(".shtm")) { 212 iBuffer = new Long (aFiles[f].length()).intValue(); 213 214 if (iBuffer>0) { 215 FileReader oReader = new FileReader (aFiles[f]); 216 aBuffer = new char[iBuffer]; 217 oReader.read(aBuffer); 218 sBuffer = new String (aBuffer); 219 220 oIWrt.addDocument ( makeHTMLDocument(sBasePath.substring(iBasePathlen), aFiles[f].getName(), sBuffer) ); 221 } } } 224 } 226 if (DebugFile.trace) { 227 DebugFile.decIdent(); 228 DebugFile.writeln("End Crawler.crawlDir()"); 229 } 230 } 232 234 244 public void crawl (String sBasePath, String sFileFilter, String sIndexDirectory, boolean bRebuild) 245 throws IOException , MalformedPatternException { 246 247 if (DebugFile.trace) { 248 DebugFile.writeln("Begin Crawler.crawl(" + sBasePath + "," + sFileFilter + "," + sIndexDirectory + ")"); 249 DebugFile.incIdent(); 250 } 251 252 IndexWriter oIWrt = new IndexWriter(sIndexDirectory, new SimpleAnalyzer(), bRebuild); 253 254 if (sBasePath.endsWith(sSeparator)) sBasePath = sBasePath.substring(0, sBasePath.length()-1); 255 256 crawlDir (oIWrt, sBasePath, sBasePath.length(), new RegExpFilter(sFileFilter)); 257 258 oIWrt.optimize(); 259 oIWrt.close(); 260 261 if (DebugFile.trace) { 262 DebugFile.decIdent(); 263 DebugFile.writeln("End Crawler.crawl()"); 264 } 265 } 267 269 private static void printUsage() { 270 System.out.println(""); 271 System.out.println("Usage:"); 272 System.out.println("Crawler cnf_path rebuild index_name base_path"); 273 } 274 275 277 public static void main(String [] argv) 278 throws NoSuchFieldException , IOException , FileNotFoundException , MalformedPatternException { 279 280 if (argv.length!=4) 281 printUsage(); 282 else if (!argv[1].equals("rebuild")) { 283 printUsage(); 284 } 285 else { 286 Properties oProps = new Properties (); 287 FileInputStream oCNF = new FileInputStream (argv[0]); 288 oProps.load(oCNF); 289 oCNF.close(); 290 291 String sDirectory = oProps.getProperty("luceneindex"); 292 293 if (null==sDirectory) 294 throw new NoSuchFieldException ("Cannot find luceneindex property"); 295 296 if (!sDirectory.endsWith(System.getProperty("file.separator"))) 297 sDirectory += System.getProperty("file.separator"); 298 299 new Crawler().crawl (argv[3], ".*htm*$", sDirectory + argv[2], true); 300 } 301 } 303 } | Popular Tags |