KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > archive > crawler > processor > CrawlMapper


/* CrawlMapper
 *
 * Created on Sep 30, 2005
 *
 * Copyright (C) 2005 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

23 package org.archive.crawler.processor;
24
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.PrintWriter;
import java.util.HashMap;
import java.util.Iterator;
import java.util.logging.Logger;

import javax.management.AttributeNotFoundException;

import org.archive.crawler.datamodel.CandidateURI;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.datamodel.FetchStatusCodes;
import org.archive.crawler.deciderules.DecideRule;
import org.archive.crawler.deciderules.DecideRuleSequence;
import org.archive.crawler.framework.Processor;
import org.archive.crawler.settings.SimpleType;
import org.archive.util.ArchiveUtils;
import org.archive.util.fingerprint.ArrayLongFPCache;

import st.ata.util.FPGenerator;
46
47 /**
48  * A simple crawl splitter/mapper, dividing up CandidateURIs/CrawlURIs
49  * between crawlers by diverting some range of URIs to local log files
50  * (which can then be imported to other crawlers).
51  *
52  * May operate on a CrawlURI (typically early in the processing chain) or
53  * its CandidateURI outlinks (late in the processing chain, after
54  * LinksScoper), or both (if inserted and configured in both places).
55  *
56  * <p>Applies a map() method, supplied by a concrete subclass, to
57  * classKeys to map URIs to crawlers by name.
58  *
59  * <p>One crawler name is distinguished as the 'local name'; URIs mapped to
60  * this name are not diverted, but continue to be processed normally.
61  *
62  * <p>If using the JMX importUris operation importing URLs dropped by
63  * a {@link CrawlMapper} instance, use <code>recoveryLog</code> style.
64  *
65  * @author gojomo
66  * @version $Date: 2007/01/13 01:31:25 $, $Revision: 1.8.4.1 $
67  */

68 public abstract class CrawlMapper extends Processor implements FetchStatusCodes {
69     /**
70      * PrintWriter which remembers the File to which it writes.
71      */

72     private class FilePrintWriter extends PrintWriter JavaDoc {
73         File JavaDoc file;
74         public FilePrintWriter(File JavaDoc file) throws FileNotFoundException JavaDoc {
75             super(new BufferedOutputStream JavaDoc(new FileOutputStream JavaDoc(file)));
76             this.file = file;
77         }
78         public File JavaDoc getFile() {
79             return file;
80         }
81     }
82     
83     /** whether to map CrawlURI itself (if status nonpositive) */
84     public static final String JavaDoc ATTR_CHECK_URI = "check-uri";
85     public static final Boolean JavaDoc DEFAULT_CHECK_URI = Boolean.TRUE;
86     
87     /** whether to map CrawlURI's outlinks (if CandidateURIs) */
88     public static final String JavaDoc ATTR_CHECK_OUTLINKS = "check-outlinks";
89     public static final Boolean JavaDoc DEFAULT_CHECK_OUTLINKS = Boolean.TRUE;
90
91     /** decide rules to determine if an outlink is subject to mapping */
92     public static final String JavaDoc ATTR_MAP_OUTLINK_DECIDE_RULES = "decide-rules";
93
94     /** name of local crawler (URIs mapped to here are not diverted) */
95     public static final String JavaDoc ATTR_LOCAL_NAME = "local-name";
96     public static final String JavaDoc DEFAULT_LOCAL_NAME = ".";
97     
98     /** where to log diversions */
99     public static final String JavaDoc ATTR_DIVERSION_DIR = "diversion-dir";
100     public static final String JavaDoc DEFAULT_DIVERSION_DIR = "diversions";
101
102     /** rotate logs when change occurs within this # of digits of timestamp */
103     public static final String JavaDoc ATTR_ROTATION_DIGITS = "rotation-digits";
104     public static final Integer JavaDoc DEFAULT_ROTATION_DIGITS = new Integer JavaDoc(10); // hourly
105

106     /**
107      * Mapping of target crawlers to logs (PrintWriters)
108      */

109     HashMap JavaDoc<String JavaDoc,PrintWriter JavaDoc> diversionLogs
110      = new HashMap JavaDoc<String JavaDoc,PrintWriter JavaDoc>();
111
112     /**
113      * Truncated timestamp prefix for diversion logs; when
114      * current time doesn't match, it's time to close all
115      * current logs.
116      */

117     String JavaDoc logGeneration = "";
118     
119     /** name of the enclosing crawler (URIs mapped here stay put) */
120     protected String JavaDoc localName;
121     
122     protected ArrayLongFPCache cache;
123     
124     /**
125      * Constructor.
126      * @param name Name of this processor.
127      */

128     public CrawlMapper(String JavaDoc name, String JavaDoc description) {
129         super(name, description);
130         addElementToDefinition(new SimpleType(ATTR_LOCAL_NAME,
131             "Name of local crawler node; mappings to this name " +
132             "result in normal processing (no diversion).",
133             DEFAULT_LOCAL_NAME));
134         addElementToDefinition(new SimpleType(ATTR_DIVERSION_DIR,
135             "Directory to write diversion logs.",
136             DEFAULT_DIVERSION_DIR));
137         addElementToDefinition(new SimpleType(ATTR_CHECK_URI,
138             "Whether to apply the mapping to a URI being processed " +
139             "itself, for example early in processing (while its " +
140             "status is still 'unattempted').",
141             DEFAULT_CHECK_URI));
142         addElementToDefinition(new SimpleType(ATTR_CHECK_OUTLINKS,
143             "Whether to apply the mapping to discovered outlinks, " +
144             "for example after extraction has occurred. ",
145             DEFAULT_CHECK_OUTLINKS));
146         addElementToDefinition(new DecideRuleSequence(
147                 ATTR_MAP_OUTLINK_DECIDE_RULES));
148         addElementToDefinition(new SimpleType(ATTR_ROTATION_DIGITS,
149                 "Number of timestamp digits to use as prefix of log " +
150                 "names (grouping all diversions from that period in " +
151                 "a single log). Default is 10 (hourly log rotation).",
152                 DEFAULT_ROTATION_DIGITS));
153     }
154
155
156     protected void innerProcess(CrawlURI curi) {
157         String JavaDoc nowGeneration =
158             ArchiveUtils.get14DigitDate().substring(
159                         0,
160                         ((Integer JavaDoc) getUncheckedAttribute(null,
161                                 ATTR_ROTATION_DIGITS)).intValue());
162         if(!nowGeneration.equals(logGeneration)) {
163             updateGeneration(nowGeneration);
164         }
165         
166         if (curi.getFetchStatus() == 0
167                 && ((Boolean JavaDoc) getUncheckedAttribute(null, ATTR_CHECK_URI))
168                         .booleanValue()) {
169             // apply mapping to the CrawlURI itself
170
String JavaDoc target = map(curi);
171             if(!localName.equals(target)) {
172                 // CrawlURI is mapped to somewhere other than here
173
curi.setFetchStatus(S_BLOCKED_BY_CUSTOM_PROCESSOR);
174                 curi.addAnnotation("to:"+target);
175                 curi.skipToProcessorChain(getController().
176                         getPostprocessorChain());
177                 divertLog(curi,target);
178             } else {
179                 // localName means keep locally; do nothing
180
}
181         }
182         
183         if (curi.getOutLinks().size() > 0 &&
184                 ((Boolean JavaDoc) getUncheckedAttribute(null, ATTR_CHECK_OUTLINKS))
185                         .booleanValue()) {
186             // consider outlinks for mapping
187
Iterator JavaDoc<CandidateURI> iter = curi.getOutCandidates().iterator();
188             while(iter.hasNext()) {
189                 CandidateURI cauri = iter.next();
190                 if (decideToMapOutlink(cauri)) {
191                     // apply mapping to the CandidateURI
192
String JavaDoc target = map(cauri);
193                     if(!localName.equals(target)) {
194                         // CandidateURI is mapped to somewhere other than here
195
iter.remove();
196                         divertLog(cauri,target);
197                     } else {
198                         // localName means keep locally; do nothing
199
}
200                 }
201             }
202         }
203     }
204     
205     protected boolean decideToMapOutlink(CandidateURI cauri) {
206         boolean rejected = getMapOutlinkDecideRule(cauri).decisionFor(cauri)
207                 .equals(DecideRule.REJECT);
208         return !rejected;
209     }
210
211     protected DecideRule getMapOutlinkDecideRule(Object JavaDoc o) {
212         try {
213             return (DecideRule)getAttribute(o, ATTR_MAP_OUTLINK_DECIDE_RULES);
214         } catch (AttributeNotFoundException JavaDoc e) {
215             throw new RuntimeException JavaDoc(e);
216         }
217     }
218     
219     
220     /**
221      * Close and mark as finished all existing diversion logs, and
222      * arrange for new logs to use the new generation prefix.
223      *
224      * @param nowGeneration new generation (timestamp prefix) to use
225      */

226     protected synchronized void updateGeneration(String JavaDoc nowGeneration) {
227         // all existing logs are of a previous generation
228
Iterator JavaDoc iter = diversionLogs.values().iterator();
229         while(iter.hasNext()) {
230             FilePrintWriter writer = (FilePrintWriter) iter.next();
231             writer.close();
232             writer.getFile().renameTo(
233                     new File JavaDoc(writer.getFile().getAbsolutePath()
234                             .replaceFirst("\\.open$", ".divert")));
235         }
236         diversionLogs.clear();
237         logGeneration = nowGeneration;
238     }
239
240     /**
241      * Look up the crawler node name to which the given CandidateURI
242      * should be mapped.
243      *
244      * @param cauri CandidateURI to consider
245      * @return String node name which should handle URI
246      */

247     protected abstract String JavaDoc map(CandidateURI cauri);
248
249     
250     /**
251      * Note the given CandidateURI in the appropriate diversion log.
252      *
253      * @param cauri CandidateURI to append to a diversion log
254      * @param target String node name (log name) to receive URI
255      */

256     protected synchronized void divertLog(CandidateURI cauri, String JavaDoc target) {
257         if(recentlySeen(cauri)) {
258             return;
259         }
260         PrintWriter JavaDoc diversionLog = getDiversionLog(target);
261         cauri.singleLineReportTo(diversionLog);
262         diversionLog.println();
263     }
264     
265     /**
266      * Consult the cache to determine if the given URI
267      * has been recently seen -- entering it if not.
268      *
269      * @param cauri CandidateURI to test
270      * @return true if URI was already in the cache; false otherwise
271      */

272     private boolean recentlySeen(CandidateURI cauri) {
273         long fp = FPGenerator.std64.fp(cauri.toString());
274         return ! cache.add(fp);
275     }
276
277     /**
278      * Get the diversion log for a given target crawler node node.
279      *
280      * @param target crawler node name of requested log
281      * @return PrintWriter open on an appropriately-named
282      * log file
283      */

284     protected PrintWriter JavaDoc getDiversionLog(String JavaDoc target) {
285         FilePrintWriter writer = (FilePrintWriter) diversionLogs.get(target);
286         if(writer == null) {
287             String JavaDoc divertDirPath = (String JavaDoc) getUncheckedAttribute(null,ATTR_DIVERSION_DIR);
288             File JavaDoc divertDir = new File JavaDoc(divertDirPath);
289             if (!divertDir.isAbsolute()) {
290                 divertDir = new File JavaDoc(getSettingsHandler().getOrder()
291                         .getController().getDisk(), divertDirPath);
292             }
293             divertDir.mkdirs();
294             File JavaDoc divertLog =
295                 new File JavaDoc(divertDir,
296                          logGeneration+"-"+localName+"-to-"+target+".open");
297             try {
298                 writer = new FilePrintWriter(divertLog);
299             } catch (FileNotFoundException JavaDoc e) {
300                 // TODO Auto-generated catch block
301
e.printStackTrace();
302                 throw new RuntimeException JavaDoc(e);
303             }
304             diversionLogs.put(target,writer);
305         }
306         return writer;
307     }
308
309     protected void initialTasks() {
310         super.initialTasks();
311         localName = (String JavaDoc) getUncheckedAttribute(null, ATTR_LOCAL_NAME);
312         cache = new ArrayLongFPCache();
313     }
314 }
Popular Tags