KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > com > knowgate > lucene > Crawler


1 /*
2   Copyright (C) 2003 Know Gate S.L. All rights reserved.
3                       C/Oña, 107 1º2 28050 Madrid (Spain)
4
5   Redistribution and use in source and binary forms, with or without
6   modification, are permitted provided that the following conditions
7   are met:
8
9   1. Redistributions of source code must retain the above copyright
10      notice, this list of conditions and the following disclaimer.
11
12   2. The end-user documentation included with the redistribution,
13      if any, must include the following acknowledgment:
14      "This product includes software parts from hipergate
15      (http://www.hipergate.org/)."
16      Alternately, this acknowledgment may appear in the software itself,
17      if and wherever such third-party acknowledgments normally appear.
18
19   3. The name hipergate must not be used to endorse or promote products
20      derived from this software without prior written permission.
21      Products derived from this software may not be called hipergate,
22      nor may hipergate appear in their name, without prior written
23      permission.
24
25   This library is distributed in the hope that it will be useful,
26   but WITHOUT ANY WARRANTY; without even the implied warranty of
27   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
28
29   You should have received a copy of hipergate License with this code;
30   if not, visit http://www.hipergate.org or mail to info@hipergate.org
31 */

32
33 package com.knowgate.lucene;
34
35 import java.util.Properties JavaDoc;
36
37 import java.io.FileNotFoundException JavaDoc;
38 import java.io.IOException JavaDoc;
39 import java.io.File JavaDoc;
40 import java.io.FilenameFilter JavaDoc;
41 import java.io.FileReader JavaDoc;
42 import java.io.FileInputStream JavaDoc;
43
44 import org.apache.lucene.analysis.*;
45 import org.apache.lucene.index.*;
46 import org.apache.lucene.document.*;
47
48 import org.apache.oro.text.regex.*;
49
50 import com.knowgate.debug.DebugFile;
51
52 /**
53  * <p>Simple HTML crawler for Lucene</p>
54  * @author Sergio Montoro Ten
55  * @version 1.0
56  * @see http://jakarta.apache.org/lucene/docs/index.html
57  */

58
59 public class Crawler {
60
61   class RegExpFilter implements FilenameFilter JavaDoc {
62
63     private Pattern oPattern;
64     private PatternMatcher oMatcher;
65     private PatternCompiler oCompiler;
66
67     RegExpFilter (String JavaDoc sPattern) throws MalformedPatternException {
68       oMatcher = new Perl5Matcher();
69       oCompiler = new Perl5Compiler();
70       oPattern = oCompiler.compile(sPattern);
71     }
72
73     public boolean accept(File JavaDoc oFile, String JavaDoc sName) {
74       return oFile.isDirectory() || oMatcher.matches(sName, oPattern);
75     }
76   } // RegExpFilter
77

78   // ---------------------------------------------------------------------------
79
// Private Variables
80

81   private String JavaDoc sSeparator;
82   private PatternMatcher oMatcher;
83   private PatternCompiler oCompiler;
84   private Pattern oTagPattern;
85
86   // ---------------------------------------------------------------------------
87

88   public Crawler() {
89     oMatcher = new Perl5Matcher();
90     oCompiler = new Perl5Compiler();
91
92     try {
93       oTagPattern = oCompiler.compile("<[^>]*>");
94     }
95     catch (MalformedPatternException mpe) { }
96
97     sSeparator = System.getProperty("file.separator");
98   }
99   // ---------------------------------------------------------------------------
100

101   private Document makeHTMLDocument (String JavaDoc sRelativePath, String JavaDoc sName, String JavaDoc sHTMLText) {
102     int iTitleStart, iTitleEnd;
103
104     if (DebugFile.trace) DebugFile.writeln("Crawler.addHTMLDocument(" + sRelativePath + "," + sName + ")");
105
106     iTitleStart = sHTMLText.indexOf("<TITLE>");
107     if (iTitleStart<0) iTitleStart = sHTMLText.indexOf("<title>");
108
109     if (iTitleStart>=0) {
110       iTitleEnd = sHTMLText.indexOf("</TITLE>");
111       if (iTitleEnd<0) iTitleEnd = sHTMLText.indexOf("</title>");
112     }
113     else
114       iTitleEnd = -1;
115
116     String JavaDoc sTitle;
117
118     if (iTitleStart>=0 && iTitleEnd>0)
119
120       sTitle = sHTMLText.substring (iTitleStart+7, iTitleEnd).trim();
121
122     else {
123
124       sTitle = null;
125
126       // ***************************************************************
127
// Código ñapa para indexar las listas de correo waltrappa de Iván
128

129       iTitleStart = sHTMLText.indexOf("<H1>");
130       if (iTitleStart<0) iTitleStart = sHTMLText.indexOf("<h1>");
131
132       if (iTitleStart>=0) {
133         iTitleEnd = sHTMLText.indexOf("</H1>");
134         if (iTitleEnd<0) iTitleEnd = sHTMLText.indexOf("</h1>");
135       }
136
137       if (iTitleStart>=0 && iTitleEnd>0)
138         sTitle = sHTMLText.substring (iTitleStart+4, iTitleEnd).trim();
139
140       iTitleStart = sHTMLText.indexOf("<H2>");
141       if (iTitleStart<0) iTitleStart = sHTMLText.indexOf("<h2>");
142
143       if (iTitleStart>=0) {
144         iTitleEnd = sHTMLText.indexOf("</H2>");
145         if (iTitleEnd<0) iTitleEnd = sHTMLText.indexOf("</h2>");
146       }
147
148       if (iTitleStart>=0 && iTitleEnd>0)
149         if (null==sTitle)
150           sTitle = sHTMLText.substring (iTitleStart+4, iTitleEnd).trim();
151         else
152           sTitle += " " + sHTMLText.substring (iTitleStart+4, iTitleEnd).trim();
153
154       // Fin de ñapa
155
// ***************************************************************
156

157       if (sTitle==null) sTitle = "untitled";
158     }
159
160     Document oDoc = new Document();
161
162     oDoc.add (new Field("subpath", sRelativePath, true, false, false));
163     oDoc.add (new Field("name", sName, true, false, false));
164     oDoc.add (Field.Keyword("title", sTitle));
165     oDoc.add (Field.UnStored("text" , Util.substitute(oMatcher, oTagPattern, new StringSubstitution(""), sHTMLText, Util.SUBSTITUTE_ALL)));
166
167     return oDoc;
168   } // makeHTMLDocument
169

170   // ---------------------------------------------------------------------------
171

172   private void crawlDir (IndexWriter oIWrt, String JavaDoc sBasePath, int iBasePathlen, RegExpFilter oFileFilter)
173     throws IOException JavaDoc, FileNotFoundException JavaDoc {
174
175     if (DebugFile.trace) {
176       DebugFile.writeln("Begin Crawler.crawlDir(" + sBasePath + ")");
177       DebugFile.incIdent();
178     }
179
180     File JavaDoc oBaseDir = new File JavaDoc(sBasePath);
181     String JavaDoc sName;
182
183     if (!oBaseDir.exists())
184       throw new FileNotFoundException JavaDoc (sBasePath + " directory does not exist");
185
186     if (!oBaseDir.isDirectory())
187       throw new IOException JavaDoc (sBasePath + " is not a directory");
188
189     File JavaDoc[] aFiles = oBaseDir.listFiles();
190     int iFiles = aFiles.length;
191
192     int iBuffer;
193     char[] aBuffer;
194     String JavaDoc sBuffer;
195     String JavaDoc sText;
196     Document oDoc;
197
198     sBasePath += sSeparator;
199
200     for (int f=0; f<iFiles; f++) {
201
202       if (aFiles[f].isDirectory()) {
203
204         crawlDir ( oIWrt, sBasePath + aFiles[f].getName(), iBasePathlen, oFileFilter);
205       }
206
207       else {
208
209         sName = aFiles[f].getName().toLowerCase();
210
211         if (sName.endsWith(".htm") || sName.endsWith(".html") || sName.endsWith(".shtml") || sName.endsWith(".shtm")) {
212           iBuffer = new Long JavaDoc(aFiles[f].length()).intValue();
213
214           if (iBuffer>0) {
215             FileReader JavaDoc oReader = new FileReader JavaDoc(aFiles[f]);
216             aBuffer = new char[iBuffer];
217             oReader.read(aBuffer);
218             sBuffer = new String JavaDoc(aBuffer);
219
220             oIWrt.addDocument ( makeHTMLDocument(sBasePath.substring(iBasePathlen), aFiles[f].getName(), sBuffer) );
221           } // fi (iBuffer>0)
222
} // fi (sName.endsWith(".htm") || sName.endsWith(".html"))
223
}
224     } // next
225

226     if (DebugFile.trace) {
227       DebugFile.decIdent();
228       DebugFile.writeln("End Crawler.crawlDir()");
229     }
230   } // crawlDir
231

232   // ---------------------------------------------------------------------------
233

234   /**
235    * <p>Add contents to a Lucene Index
236    * @param sBasePath Base Path for crawling
237    * @param sFileFilter Perl5 Regular Expression filter for file names
238    * @param sIndexDirectory Lucene index target directory
239    * @param bRebuild <b>true</b> if index must be deleted and fully rebuild.
240    * @throws IOException
241    * @throws FileNotFoundException If sBasePath direcory does not exist
242    * @throws MalformedPatternException If sFileFilter is not a valid Perl5 regular expression pattern
243    */

244   public void crawl (String JavaDoc sBasePath, String JavaDoc sFileFilter, String JavaDoc sIndexDirectory, boolean bRebuild)
245     throws IOException JavaDoc, MalformedPatternException {
246
247     if (DebugFile.trace) {
248       DebugFile.writeln("Begin Crawler.crawl(" + sBasePath + "," + sFileFilter + "," + sIndexDirectory + ")");
249       DebugFile.incIdent();
250     }
251
252     IndexWriter oIWrt = new IndexWriter(sIndexDirectory, new SimpleAnalyzer(), bRebuild);
253
254     if (sBasePath.endsWith(sSeparator)) sBasePath = sBasePath.substring(0, sBasePath.length()-1);
255
256     crawlDir (oIWrt, sBasePath, sBasePath.length(), new RegExpFilter(sFileFilter));
257
258     oIWrt.optimize();
259     oIWrt.close();
260
261     if (DebugFile.trace) {
262       DebugFile.decIdent();
263       DebugFile.writeln("End Crawler.crawl()");
264     }
265   } // crawl
266

267   // ---------------------------------------------------------------------------
268

269   private static void printUsage() {
270     System.out.println("");
271     System.out.println("Usage:");
272     System.out.println("Crawler cnf_path rebuild index_name base_path");
273   }
274
275   // ---------------------------------------------------------------------------
276

277   public static void main(String JavaDoc[] argv)
278     throws NoSuchFieldException JavaDoc, IOException JavaDoc, FileNotFoundException JavaDoc, MalformedPatternException {
279
280     if (argv.length!=4)
281       printUsage();
282     else if (!argv[1].equals("rebuild")) {
283       printUsage();
284     }
285     else {
286       Properties JavaDoc oProps = new Properties JavaDoc();
287       FileInputStream JavaDoc oCNF = new FileInputStream JavaDoc(argv[0]);
288       oProps.load(oCNF);
289       oCNF.close();
290
291       String JavaDoc sDirectory = oProps.getProperty("luceneindex");
292
293       if (null==sDirectory)
294         throw new NoSuchFieldException JavaDoc ("Cannot find luceneindex property");
295
296       if (!sDirectory.endsWith(System.getProperty("file.separator")))
297         sDirectory += System.getProperty("file.separator");
298
299       new Crawler().crawl (argv[3], ".*htm*$", sDirectory + argv[2], true);
300     }
301   } // main
302

303 } // Crawler
Popular Tags