KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > archive > crawler > util > RecoveryLogMapper


1 /* RecoveryLogMapper.java
2 *
3 * $Id: RecoveryLogMapper.java,v 1.4.14.1 2007/01/13 01:31:30 stack-sf Exp $
4 *
5 * Created on Mar 7, 2005
6 *
7 * Copyright (C) 2005 Mike Schwartz.
8 *
9 * This file is part of the Heritrix web crawler (crawler.archive.org).
10 *
11 * Heritrix is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU Lesser Public License as published by
13 * the Free Software Foundation; either version 2.1 of the License, or
14 * any later version.
15 *
16 * Heritrix is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU Lesser Public License for more details.
20 *
21 * You should have received a copy of the GNU Lesser Public License
22 * along with Heritrix; if not, write to the Free Software
23 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
24 */

25
26 /**
27  * Parses a Heritrix recovery log file (recover.gz), and builds maps
28  * that allow a caller to look up any seed URL and get back an Iterator of all
29  * URLs successfully crawled from given seed.
30  *
31  * Also allows lookup on any crawled
32  * URL to find the seed URL from which the crawler reached that URL (through 1
33  * or more discovered URL hops, which are collapsed in this lookup).
34  *
35  * <p>This code creates some fairly large collections (proportionate in size to
36  * # discovered URLs) so make sure you allocate
37  * it a large heap to work in. It also takes a while to process a recover log.
38  * <p>See {@link #main(String[])} method at end for test/demo code.
39  * @author Mike Schwartz, schwartz at CodeOnTheRoad dot com
40  */

41 package org.archive.crawler.util;
42
43 import org.archive.crawler.frontier.RecoveryJournal;
44
45 import java.io.File JavaDoc;
46 import java.io.LineNumberReader JavaDoc;
47 import java.io.PrintWriter JavaDoc;
48 import java.io.FileOutputStream JavaDoc;
49 import java.util.Collection JavaDoc;
50 import java.util.HashMap JavaDoc;
51 import java.util.HashSet JavaDoc;
52 import java.util.Iterator JavaDoc;
53 import java.util.Map JavaDoc;
54 import java.util.Set JavaDoc;
55 import java.util.logging.Level JavaDoc;
56 import java.util.logging.Logger JavaDoc;
57
58 public class RecoveryLogMapper {
59     private static final char LOG_LINE_START_CHAR =
60         RecoveryJournal.F_ADD.charAt(0);
61     private static final Logger JavaDoc logger =
62         Logger.getLogger(RecoveryLogMapper.class.getName());
63     private PrintWriter JavaDoc seedNotFoundPrintWriter = null;
64
65     /**
66      * Tracks seed for each crawled URL
67      */

68     private Map JavaDoc<String JavaDoc,String JavaDoc> crawledUrlToSeedMap
69      = new HashMap JavaDoc<String JavaDoc,String JavaDoc>();
70
71     /**
72      * Maps seed URLs to Set of discovered URLs
73      */

74     private Map JavaDoc<String JavaDoc,Set JavaDoc<String JavaDoc>> seedUrlToDiscoveredUrlsMap
75      = new HashMap JavaDoc<String JavaDoc,Set JavaDoc<String JavaDoc>>();
76
77     /**
78      * Tracks which URLs were successfully crawled
79      */

80     private Set JavaDoc<String JavaDoc> successfullyCrawledUrls = new HashSet JavaDoc<String JavaDoc>();
81
82      /**
83      * Normal constructor - if encounter not-found seeds while loading
84      * recoverLogFileName, will throw throw SeedUrlNotFoundException.
85      * Use {@link #RecoveryLogMapper(String)} if you want to just log
86      * such cases and keep going. (Those should not happen if the
87      * recover log is written correctly, but we see them in pratice.)
88      * @param recoverLogFileName
89      * @throws java.io.FileNotFoundException
90      * @throws java.io.IOException
91      * @throws SeedUrlNotFoundException
92      */

93     public RecoveryLogMapper(String JavaDoc recoverLogFileName)
94     throws java.io.FileNotFoundException JavaDoc, java.io.IOException JavaDoc,
95             SeedUrlNotFoundException {
96         load(recoverLogFileName);
97     }
98
99     /**
100      * Constructor to use if you want to allow not-found seeds, logging
101      * them to seedNotFoundLogFileName. In contrast, {@link
102      * #RecoveryLogMapper(String)} will throw SeedUrlNotFoundException
103      * when a seed isn't found.
104      * @param recoverLogFileName
105      * @param seedNotFoundLogFileName
106      */

107     public RecoveryLogMapper(String JavaDoc recoverLogFileName,
108                              String JavaDoc seedNotFoundLogFileName)
109         throws java.io.FileNotFoundException JavaDoc, java.io.IOException JavaDoc,
110                SeedUrlNotFoundException {
111         seedNotFoundPrintWriter = new PrintWriter JavaDoc(new FileOutputStream JavaDoc(
112                seedNotFoundLogFileName));
113         load(recoverLogFileName);
114     }
115
116     protected void load(String JavaDoc recoverLogFileName)
117     throws java.io.FileNotFoundException JavaDoc, java.io.IOException JavaDoc,
118             SeedUrlNotFoundException {
119         LineNumberReader JavaDoc reader = new LineNumberReader JavaDoc(RecoveryJournal.
120             getBufferedReader(new File JavaDoc(recoverLogFileName)));
121         String JavaDoc curLine = null;
122         while ((curLine = reader.readLine()) != null) {
123             if (curLine.length() == 0
124                     || curLine.charAt(0) != LOG_LINE_START_CHAR) {
125                 continue;
126             }
127             String JavaDoc args[] = curLine.split("\\s+");
128             int curLineNumWords = args.length;
129             String JavaDoc firstUrl = args[1];
130             // Ignore DNS log entries
131
if (firstUrl.startsWith("dns:")) {
132                 continue;
133             }
134             if (curLine.startsWith(RecoveryJournal.F_ADD)) {
135                 // Seed URL
136
if (curLineNumWords == 2) {
137                     if (logger.isLoggable(Level.FINE)) {
138                         logger.fine("F_ADD with 2 words --> seed URL (" +
139                             firstUrl + ")");
140                     }
141                     // Add seed the first time we find it
142
if (seedUrlToDiscoveredUrlsMap.get(firstUrl) == null) {
143                         seedUrlToDiscoveredUrlsMap.put(firstUrl,
144                             new HashSet JavaDoc<String JavaDoc>());
145                     }
146                 } else {
147                     // URL found via an earlier seeded / discovered URL
148
// Look for the seed from which firstUrlString came, so
149
// we can collapse new URLString back to it
150
String JavaDoc viaUrl = args[curLineNumWords - 1];
151                     if (logger.isLoggable(Level.FINE)) {
152                         logger.fine("F_ADD with 3+ words --> new URL "
153                                 + firstUrl + " via URL " + viaUrl);
154                     }
155                     String JavaDoc seedForFirstUrl =
156                         (String JavaDoc) crawledUrlToSeedMap.get(viaUrl);
157                     // viaUrlString is a seed URL
158
if (seedForFirstUrl == null) {
159                         if (logger.isLoggable(Level.FINE)) {
160                             logger.fine("\tvia URL is a seed");
161                         }
162                         crawledUrlToSeedMap.put(firstUrl, viaUrl);
163                         seedForFirstUrl = viaUrl;
164                     } else {
165                         if (logger.isLoggable(Level.FINE)) {
166                             logger.fine("\tvia URL discovered via seed URL " +
167                                 seedForFirstUrl);
168                         }
169                         // Collapse
170
crawledUrlToSeedMap.put(firstUrl, seedForFirstUrl);
171                     }
172                     Set JavaDoc<String JavaDoc> theSeedUrlList =
173                         seedUrlToDiscoveredUrlsMap.get(seedForFirstUrl);
174                         if (theSeedUrlList == null) {
175                         String JavaDoc message = "recover log " +
176                                          recoverLogFileName + " at line " +
177                                          reader.getLineNumber() +
178                                          " listed F+ URL (" + viaUrl +
179                                          ") for which found no seed list.";
180                         if (seedNotFoundPrintWriter != null) {
181                             seedNotFoundPrintWriter.println(message);
182                         } else {
183                             throw new SeedUrlNotFoundException(message);
184                         }
185                     } else {
186                         theSeedUrlList.add(firstUrl);
187                     }
188                 }
189             } else if (curLine.startsWith(RecoveryJournal.F_SUCCESS)) {
190                 if (logger.isLoggable(Level.FINE)) {
191                     logger.fine("F_SUCCESS for URL " + firstUrl);
192                 }
193                 successfullyCrawledUrls.add(firstUrl);
194             }
195         }
196         reader.close();
197         if (seedNotFoundPrintWriter != null) {
198             seedNotFoundPrintWriter.close();
199         }
200     }
201
202     /**
203      * Returns seed for urlString (null if seed not found).
204      * @param urlString
205      * @return Seed.
206      */

207     public String JavaDoc getSeedForUrl(String JavaDoc urlString) {
208         return (seedUrlToDiscoveredUrlsMap.get(urlString) != null)?
209                 urlString: crawledUrlToSeedMap.get(urlString);
210     }
211
212     /**
213      * @return Returns the seedUrlToDiscoveredUrlsMap.
214      */

215     public Map JavaDoc getSeedUrlToDiscoveredUrlsMap() {
216         return this.seedUrlToDiscoveredUrlsMap;
217     }
218
219     /**
220      * @return Returns the successfullyCrawledUrls.
221      */

222     public Set JavaDoc getSuccessfullyCrawledUrls() {
223         return this.successfullyCrawledUrls;
224     }
225
226     /**
227      * @return Returns the logger.
228      */

229     public static Logger JavaDoc getLogger() {
230         return logger;
231     }
232
233     private class SuccessfullyCrawledURLsIterator
234     implements Iterator JavaDoc<String JavaDoc> {
235         private String JavaDoc nextValue = null;
236         private Iterator JavaDoc discoveredUrlsIterator;
237
238         public SuccessfullyCrawledURLsIterator(String JavaDoc seedUrlString)
239         throws SeedUrlNotFoundException {
240             Set JavaDoc discoveredUrlList =
241                 (Set JavaDoc)getSeedUrlToDiscoveredUrlsMap().get(seedUrlString);
242             if (discoveredUrlList == null) {
243                 throw new SeedUrlNotFoundException("Seed URL " +
244                     seedUrlString + " not found in seed list");
245             }
246             discoveredUrlsIterator = discoveredUrlList.iterator();
247         }
248
249         /**
250          * Idempotent method (because of null check on nextValue).
251          */

252         private void populateNextValue() {
253             while (nextValue == null & discoveredUrlsIterator.hasNext()) {
254                 String JavaDoc curDiscoveredUrl =
255                     (String JavaDoc)discoveredUrlsIterator.next();
256                 boolean succCrawled = getSuccessfullyCrawledUrls().
257                     contains(curDiscoveredUrl);
258                 if (getLogger().isLoggable(Level.FINE)) {
259                     getLogger().fine("populateNextValue: curDiscoveredUrl=" +
260                             curDiscoveredUrl + ", succCrawled=" +
261                             succCrawled);
262                 }
263                 if (succCrawled)
264                     nextValue = curDiscoveredUrl;
265             }
266         }
267
268         public boolean hasNext() {
269             populateNextValue();
270             return (nextValue != null);
271         }
272
273         public String JavaDoc next() {
274             populateNextValue();
275             String JavaDoc returnValue = nextValue;
276             nextValue = null;
277             return returnValue;
278         }
279
280         /**
281          * Remove operation is unsupported in this Iterator
282          * (will throw UnsupportedOperationException if called).
283          */

284         public void remove() {
285             throw new UnsupportedOperationException JavaDoc(
286                 "SuccessfullyCrawledURLsIterator.remove: not supported.");
287         }
288     }
289
290     public Iterator JavaDoc<String JavaDoc> getIteratorOfURLsSuccessfullyCrawledFromSeedUrl(
291             String JavaDoc seedUrlString) throws SeedUrlNotFoundException {
292         return new SuccessfullyCrawledURLsIterator(seedUrlString);
293     }
294
295     public Collection JavaDoc<String JavaDoc> getSeedCollection() {
296         return seedUrlToDiscoveredUrlsMap.keySet();
297     }
298
299     public static void main(String JavaDoc args[]) {
300         if (args.length < 1) {
301             System.out.println("Usage: RecoveryLogMapper recoverLogFileName");
302             Runtime.getRuntime().exit(-1);
303         }
304         String JavaDoc recoverLogFileName = args[0];
305         try {
306             RecoveryLogMapper myRecoveryLogMapper =
307                 new RecoveryLogMapper(recoverLogFileName);
308             for (String JavaDoc curSeedUrl: myRecoveryLogMapper.getSeedCollection()) {
309                 System.out.println("URLs successfully crawled from seed URL "
310                     + curSeedUrl);
311                 Iterator JavaDoc iteratorOfUrlsCrawledFromSeedUrl =
312                     myRecoveryLogMapper.
313                         getIteratorOfURLsSuccessfullyCrawledFromSeedUrl(
314                             curSeedUrl);
315                 while (iteratorOfUrlsCrawledFromSeedUrl.hasNext()) {
316                     String JavaDoc curCrawledUrlString =
317                         (String JavaDoc)iteratorOfUrlsCrawledFromSeedUrl.next();
318                     System.out.println(" -> " + curCrawledUrlString);
319                 }
320             }
321         } catch (Exception JavaDoc e) {
322             e.printStackTrace();
323         }
324     }
325 }
326
Popular Tags