Crawler


1   /*
2     Copyright (C) 2003  Know Gate S.L. All rights reserved.
3                         C/Oña, 107 1º2 28050 Madrid (Spain)
4   
5     Redistribution and use in source and binary forms, with or without
6     modification, are permitted provided that the following conditions
7     are met:
8   
9     1. Redistributions of source code must retain the above copyright
10       notice, this list of conditions and the following disclaimer.
11  
12    2. The end-user documentation included with the redistribution,
13       if any, must include the following acknowledgment:
14       "This product includes software parts from hipergate
15       (http://www.hipergate.org/)."
16       Alternately, this acknowledgment may appear in the software itself,
17       if and wherever such third-party acknowledgments normally appear.
18  
19    3. The name hipergate must not be used to endorse or promote products
20       derived from this software without prior written permission.
21       Products derived from this software may not be called hipergate,
22       nor may hipergate appear in their name, without prior written
23       permission.
24  
25    This library is distributed in the hope that it will be useful,
26    but WITHOUT ANY WARRANTY; without even the implied warranty of
27    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
28  
29    You should have received a copy of hipergate License with this code;
30    if not, visit http://www.hipergate.org or mail to info@hipergate.org
31  */
32  
33  package com.knowgate.lucene;
34  
35  import java.util.Properties  ;
36  
37  import java.io.FileNotFoundException  ;
38  import java.io.IOException  ;
39  import java.io.File  ;
40  import java.io.FilenameFilter  ;
41  import java.io.FileReader  ;
42  import java.io.FileInputStream  ;
43  
44  import org.apache.lucene.analysis.*;
45  import org.apache.lucene.index.*;
46  import org.apache.lucene.document.*;
47  
48  import org.apache.oro.text.regex.*;
49  
50  import com.knowgate.debug.DebugFile;
51  
52  /**
53   * <p>Simple HTML crawler for Lucene</p>
54   * @author Sergio Montoro Ten
55   * @version 1.0
56   * @see http://jakarta.apache.org/lucene/docs/index.html
57   */
58  
59  public class Crawler {
60  
61    class RegExpFilter implements FilenameFilter   {
62  
63      private Pattern oPattern;
64      private PatternMatcher oMatcher;
65      private PatternCompiler oCompiler;
66  
67      RegExpFilter (String   sPattern) throws MalformedPatternException {
68        oMatcher = new Perl5Matcher();
69        oCompiler = new Perl5Compiler();
70        oPattern = oCompiler.compile(sPattern);
71      }
72  
73      public boolean accept(File   oFile, String   sName) {
74        return oFile.isDirectory() || oMatcher.matches(sName, oPattern);
75      }
76    } // RegExpFilter
77  
78    // ---------------------------------------------------------------------------
79    // Private Variables
80  
81    private String   sSeparator;
82    private PatternMatcher oMatcher;
83    private PatternCompiler oCompiler;
84    private Pattern oTagPattern;
85  
86    // ---------------------------------------------------------------------------
87  
88    public Crawler() {
89      oMatcher = new Perl5Matcher();
90      oCompiler = new Perl5Compiler();
91  
92      try {
93        oTagPattern = oCompiler.compile("<[^>]*>");
94      }
95      catch (MalformedPatternException mpe) { }
96  
97      sSeparator = System.getProperty("file.separator");
98    }
99    // ---------------------------------------------------------------------------
100 
101   private Document makeHTMLDocument (String   sRelativePath, String   sName, String   sHTMLText) {
102     int iTitleStart, iTitleEnd;
103 
104     if (DebugFile.trace) DebugFile.writeln("Crawler.addHTMLDocument(" + sRelativePath + "," + sName + ")");
105 
106     iTitleStart = sHTMLText.indexOf("<TITLE>");
107     if (iTitleStart<0) iTitleStart = sHTMLText.indexOf("<title>");
108 
109     if (iTitleStart>=0) {
110       iTitleEnd = sHTMLText.indexOf("</TITLE>");
111       if (iTitleEnd<0) iTitleEnd = sHTMLText.indexOf("</title>");
112     }
113     else
114       iTitleEnd = -1;
115 
116     String   sTitle;
117 
118     if (iTitleStart>=0 && iTitleEnd>0)
119 
120       sTitle = sHTMLText.substring (iTitleStart+7, iTitleEnd).trim();
121 
122     else {
123 
124       sTitle = null;
125 
126       // ***************************************************************
127       // C�digo �apa para indexar las listas de correo waltrappa de Iv�n
128 
129       iTitleStart = sHTMLText.indexOf("<H1>");
130       if (iTitleStart<0) iTitleStart = sHTMLText.indexOf("<h1>");
131 
132       if (iTitleStart>=0) {
133         iTitleEnd = sHTMLText.indexOf("</H1>");
134         if (iTitleEnd<0) iTitleEnd = sHTMLText.indexOf("</h1>");
135       }
136 
137       if (iTitleStart>=0 && iTitleEnd>0)
138         sTitle = sHTMLText.substring (iTitleStart+4, iTitleEnd).trim();
139 
140       iTitleStart = sHTMLText.indexOf("<H2>");
141       if (iTitleStart<0) iTitleStart = sHTMLText.indexOf("<h2>");
142 
143       if (iTitleStart>=0) {
144         iTitleEnd = sHTMLText.indexOf("</H2>");
145         if (iTitleEnd<0) iTitleEnd = sHTMLText.indexOf("</h2>");
146       }
147 
148       if (iTitleStart>=0 && iTitleEnd>0)
149         if (null==sTitle)
150           sTitle = sHTMLText.substring (iTitleStart+4, iTitleEnd).trim();
151         else
152           sTitle += " " + sHTMLText.substring (iTitleStart+4, iTitleEnd).trim();
153 
154       // Fin de �apa
155       // ***************************************************************
156 
157       if (sTitle==null) sTitle = "untitled";
158     }
159 
160     Document oDoc = new Document();
161 
162     oDoc.add (new Field("subpath", sRelativePath, true, false, false));
163     oDoc.add (new Field("name", sName, true, false, false));
164     oDoc.add (Field.Keyword("title", sTitle));
165     oDoc.add (Field.UnStored("text" , Util.substitute(oMatcher, oTagPattern, new StringSubstitution(""), sHTMLText, Util.SUBSTITUTE_ALL)));
166 
167     return oDoc;
168   } // makeHTMLDocument
169 
170   // ---------------------------------------------------------------------------
171 
172   private void crawlDir (IndexWriter oIWrt, String   sBasePath, int iBasePathlen, RegExpFilter oFileFilter)
173     throws IOException  , FileNotFoundException   {
174 
175     if (DebugFile.trace) {
176       DebugFile.writeln("Begin Crawler.crawlDir(" + sBasePath + ")");
177       DebugFile.incIdent();
178     }
179 
180     File   oBaseDir = new File  (sBasePath);
181     String   sName;
182 
183     if (!oBaseDir.exists())
184       throw new FileNotFoundException   (sBasePath + " directory does not exist");
185 
186     if (!oBaseDir.isDirectory())
187       throw new IOException   (sBasePath + " is not a directory");
188 
189     File  [] aFiles = oBaseDir.listFiles();
190     int iFiles = aFiles.length;
191 
192     int iBuffer;
193     char[] aBuffer;
194     String   sBuffer;
195     String   sText;
196     Document oDoc;
197 
198     sBasePath += sSeparator;
199 
200     for (int f=0; f<iFiles; f++) {
201 
202       if (aFiles[f].isDirectory()) {
203 
204         crawlDir ( oIWrt, sBasePath + aFiles[f].getName(), iBasePathlen, oFileFilter);
205       }
206 
207       else {
208 
209         sName = aFiles[f].getName().toLowerCase();
210 
211         if (sName.endsWith(".htm") || sName.endsWith(".html") || sName.endsWith(".shtml") || sName.endsWith(".shtm")) {
212           iBuffer = new Long  (aFiles[f].length()).intValue();
213 
214           if (iBuffer>0) {
215             FileReader   oReader = new FileReader  (aFiles[f]);
216             aBuffer = new char[iBuffer];
217             oReader.read(aBuffer);
218             sBuffer = new String  (aBuffer);
219 
220             oIWrt.addDocument ( makeHTMLDocument(sBasePath.substring(iBasePathlen), aFiles[f].getName(), sBuffer) );
221           } // fi (iBuffer>0)
222         } // fi (sName.endsWith(".htm") || sName.endsWith(".html"))
223       }
224     } // next
225 
226     if (DebugFile.trace) {
227       DebugFile.decIdent();
228       DebugFile.writeln("End Crawler.crawlDir()");
229     }
230   } // crawlDir
231 
232   // ---------------------------------------------------------------------------
233 
234   /**
235    * <p>Add contents to a Lucene Index
236    * @param sBasePath Base Path for crawling
237    * @param sFileFilter Perl5 Regular Expression filter for file names
238    * @param sIndexDirectory Lucene index target directory
239    * @param bRebuild <b>true</b> if index must be deleted and fully rebuild.
240    * @throws IOException
241    * @throws FileNotFoundException If sBasePath direcory does not exist
242    * @throws MalformedPatternException If sFileFilter is not a valid Perl5 regular expression pattern
243    */
244   public void crawl (String   sBasePath, String   sFileFilter, String   sIndexDirectory, boolean bRebuild)
245     throws IOException  , MalformedPatternException  {
246 
247     if (DebugFile.trace) {
248       DebugFile.writeln("Begin Crawler.crawl(" + sBasePath + "," + sFileFilter + "," + sIndexDirectory + ")");
249       DebugFile.incIdent();
250     }
251 
252     IndexWriter oIWrt = new IndexWriter(sIndexDirectory, new SimpleAnalyzer(), bRebuild);
253 
254     if (sBasePath.endsWith(sSeparator)) sBasePath = sBasePath.substring(0, sBasePath.length()-1);
255 
256     crawlDir (oIWrt, sBasePath, sBasePath.length(), new RegExpFilter(sFileFilter));
257 
258     oIWrt.optimize();
259     oIWrt.close();
260 
261     if (DebugFile.trace) {
262       DebugFile.decIdent();
263       DebugFile.writeln("End Crawler.crawl()");
264     }
265   } // crawl
266 
267   // ---------------------------------------------------------------------------
268 
269   private static void printUsage() {
270     System.out.println("");
271     System.out.println("Usage:");
272     System.out.println("Crawler cnf_path rebuild index_name base_path");
273   }
274 
275   // ---------------------------------------------------------------------------
276 
277   public static void main(String  [] argv)
278     throws NoSuchFieldException  , IOException  , FileNotFoundException  , MalformedPatternException {
279 
280     if (argv.length!=4)
281       printUsage();
282     else if (!argv[1].equals("rebuild")) {
283       printUsage();
284     }
285     else {
286       Properties   oProps = new Properties  ();
287       FileInputStream   oCNF = new FileInputStream  (argv[0]);
288       oProps.load(oCNF);
289       oCNF.close();
290 
291       String   sDirectory = oProps.getProperty("luceneindex");
292 
293       if (null==sDirectory)
294         throw new NoSuchFieldException   ("Cannot find luceneindex property");
295 
296       if (!sDirectory.endsWith(System.getProperty("file.separator")))
297         sDirectory += System.getProperty("file.separator");
298 
299       new Crawler().crawl (argv[3], ".*htm*$", sDirectory + argv[2], true);
300     }
301   } // main
302 
303 } // Crawler
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags