CrawlMapper


1   /* CrawlMapper
2    * 
3    * Created on Sep 30, 2005
4    *
5    * Copyright (C) 2005 Internet Archive.
6    * 
7    * This file is part of the Heritrix web crawler (crawler.archive.org).
8    * 
9    * Heritrix is free software; you can redistribute it and/or modify
10   * it under the terms of the GNU Lesser Public License as published by
11   * the Free Software Foundation; either version 2.1 of the License, or
12   * any later version.
13   * 
14   * Heritrix is distributed in the hope that it will be useful, 
15   * but WITHOUT ANY WARRANTY; without even the implied warranty of
16   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17   * GNU Lesser Public License for more details.
18   * 
19   * You should have received a copy of the GNU Lesser Public License
20   * along with Heritrix; if not, write to the Free Software
21   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
22   */
23  package org.archive.crawler.processor;
24  
25  import java.io.BufferedOutputStream  ;
26  import java.io.File  ;
27  import java.io.FileNotFoundException  ;
28  import java.io.FileOutputStream  ;
29  import java.io.PrintWriter  ;
30  import java.util.HashMap  ;
31  import java.util.Iterator  ;
32  
33  import javax.management.AttributeNotFoundException  ;
34  
35  import org.archive.crawler.datamodel.CandidateURI;
36  import org.archive.crawler.datamodel.CrawlURI;
37  import org.archive.crawler.datamodel.FetchStatusCodes;
38  import org.archive.crawler.deciderules.DecideRule;
39  import org.archive.crawler.deciderules.DecideRuleSequence;
40  import org.archive.crawler.framework.Processor;
41  import org.archive.crawler.settings.SimpleType;
42  import org.archive.util.ArchiveUtils;
43  import org.archive.util.fingerprint.ArrayLongFPCache;
44  
45  import st.ata.util.FPGenerator;
46  
47  /**
48   * A simple crawl splitter/mapper, dividing up CandidateURIs/CrawlURIs
49   * between crawlers by diverting some range of URIs to local log files
50   * (which can then be imported to other crawlers). 
51   * 
52   * May operate on a CrawlURI (typically early in the processing chain) or
53   * its CandidateURI outlinks (late in the processing chain, after 
54   * LinksScoper), or both (if inserted and configured in both places). 
55   * 
56   * <p>Applies a map() method, supplied by a concrete subclass, to
57   * classKeys to map URIs to crawlers by name. 
58   *
59   * <p>One crawler name is distinguished as the 'local name'; URIs mapped to
60   * this name are not diverted, but continue to be processed normally.
61   *
62   * <p>If using the JMX importUris operation importing URLs dropped by
63   * a {@link CrawlMapper} instance, use <code>recoveryLog</code> style.
64   * 
65   * @author gojomo
66   * @version $Date: 2007/01/13 01:31:25 $, $Revision: 1.8.4.1 $
67   */
68  public abstract class CrawlMapper extends Processor implements FetchStatusCodes {
69      /**
70       * PrintWriter which remembers the File to which it writes. 
71       */
72      private class FilePrintWriter extends PrintWriter   {
73          File   file; 
74          public FilePrintWriter(File   file) throws FileNotFoundException   {
75              super(new BufferedOutputStream  (new FileOutputStream  (file)));
76              this.file = file; 
77          }
78          public File   getFile() {
79              return file;
80          }
81      }
82      
83      /** whether to map CrawlURI itself (if status nonpositive) */
84      public static final String   ATTR_CHECK_URI = "check-uri";
85      public static final Boolean   DEFAULT_CHECK_URI = Boolean.TRUE;
86      
87      /** whether to map CrawlURI's outlinks (if CandidateURIs) */
88      public static final String   ATTR_CHECK_OUTLINKS = "check-outlinks";
89      public static final Boolean   DEFAULT_CHECK_OUTLINKS = Boolean.TRUE;
90  
91      /** decide rules to determine if an outlink is subject to mapping */ 
92      public static final String   ATTR_MAP_OUTLINK_DECIDE_RULES = "decide-rules";
93  
94      /** name of local crawler (URIs mapped to here are not diverted) */
95      public static final String   ATTR_LOCAL_NAME = "local-name";
96      public static final String   DEFAULT_LOCAL_NAME = ".";
97      
98      /** where to log diversions  */
99      public static final String   ATTR_DIVERSION_DIR = "diversion-dir";
100     public static final String   DEFAULT_DIVERSION_DIR = "diversions";
101 
102     /** rotate logs when change occurs within this # of digits of timestamp  */
103     public static final String   ATTR_ROTATION_DIGITS = "rotation-digits";
104     public static final Integer   DEFAULT_ROTATION_DIGITS = new Integer  (10); // hourly
105     
106     /**
107      * Mapping of target crawlers to logs (PrintWriters)
108      */
109     HashMap  <String  ,PrintWriter  > diversionLogs
110      = new HashMap  <String  ,PrintWriter  >();
111 
112     /**
113      * Truncated timestamp prefix for diversion logs; when
114      * current time doesn't match, it's time to close all
115      * current logs. 
116      */
117     String   logGeneration = "";
118     
119     /** name of the enclosing crawler (URIs mapped here stay put) */
120     protected String   localName;
121     
122     protected ArrayLongFPCache cache;
123     
124     /**
125      * Constructor.
126      * @param name Name of this processor.
127      */
128     public CrawlMapper(String   name, String   description) {
129         super(name, description);
130         addElementToDefinition(new SimpleType(ATTR_LOCAL_NAME,
131             "Name of local crawler node; mappings to this name " +
132             "result in normal processing (no diversion).",
133             DEFAULT_LOCAL_NAME));
134         addElementToDefinition(new SimpleType(ATTR_DIVERSION_DIR,
135             "Directory to write diversion logs.",
136             DEFAULT_DIVERSION_DIR));
137         addElementToDefinition(new SimpleType(ATTR_CHECK_URI,
138             "Whether to apply the mapping to a URI being processed " +
139             "itself, for example early in processing (while its " +
140             "status is still 'unattempted').",
141             DEFAULT_CHECK_URI));
142         addElementToDefinition(new SimpleType(ATTR_CHECK_OUTLINKS,
143             "Whether to apply the mapping to discovered outlinks, " +
144             "for example after extraction has occurred. ",
145             DEFAULT_CHECK_OUTLINKS));
146         addElementToDefinition(new DecideRuleSequence(
147                 ATTR_MAP_OUTLINK_DECIDE_RULES));
148         addElementToDefinition(new SimpleType(ATTR_ROTATION_DIGITS,
149                 "Number of timestamp digits to use as prefix of log " +
150                 "names (grouping all diversions from that period in " +
151                 "a single log). Default is 10 (hourly log rotation).",
152                 DEFAULT_ROTATION_DIGITS));
153     }
154 
155 
156     protected void innerProcess(CrawlURI curi) {
157         String   nowGeneration = 
158             ArchiveUtils.get14DigitDate().substring(
159                         0,
160                         ((Integer  ) getUncheckedAttribute(null,
161                                 ATTR_ROTATION_DIGITS)).intValue());
162         if(!nowGeneration.equals(logGeneration)) {
163             updateGeneration(nowGeneration);
164         }
165         
166         if (curi.getFetchStatus() == 0
167                 && ((Boolean  ) getUncheckedAttribute(null, ATTR_CHECK_URI))
168                         .booleanValue()) {
169             // apply mapping to the CrawlURI itself
170             String   target = map(curi);
171             if(!localName.equals(target)) {
172                 // CrawlURI is mapped to somewhere other than here
173                 curi.setFetchStatus(S_BLOCKED_BY_CUSTOM_PROCESSOR);
174                 curi.addAnnotation("to:"+target);
175                 curi.skipToProcessorChain(getController().
176                         getPostprocessorChain());
177                 divertLog(curi,target);
178             } else {
179                 // localName means keep locally; do nothing
180             }
181         }
182         
183         if (curi.getOutLinks().size() > 0 && 
184                 ((Boolean  ) getUncheckedAttribute(null, ATTR_CHECK_OUTLINKS))
185                         .booleanValue()) {
186             // consider outlinks for mapping
187             Iterator  <CandidateURI> iter = curi.getOutCandidates().iterator(); 
188             while(iter.hasNext()) {
189                 CandidateURI cauri = iter.next();
190                 if (decideToMapOutlink(cauri)) {
191                     // apply mapping to the CandidateURI
192                     String   target = map(cauri);
193                     if(!localName.equals(target)) {
194                         // CandidateURI is mapped to somewhere other than here
195                         iter.remove();
196                         divertLog(cauri,target);
197                     } else {
198                         // localName means keep locally; do nothing
199                     }
200                 }
201             }
202         }
203     }
204     
205     protected boolean decideToMapOutlink(CandidateURI cauri) {
206         boolean rejected = getMapOutlinkDecideRule(cauri).decisionFor(cauri)
207                 .equals(DecideRule.REJECT);
208         return !rejected;
209     }
210 
211     protected DecideRule getMapOutlinkDecideRule(Object   o) {
212         try {
213             return (DecideRule)getAttribute(o, ATTR_MAP_OUTLINK_DECIDE_RULES);
214         } catch (AttributeNotFoundException   e) {
215             throw new RuntimeException  (e);
216         }
217     }
218     
219     
220     /**
221      * Close and mark as finished all existing diversion logs, and
222      * arrange for new logs to use the new generation prefix.
223      * 
224      * @param nowGeneration new generation (timestamp prefix) to use
225      */
226     protected synchronized void updateGeneration(String   nowGeneration) {
227         // all existing logs are of a previous generation
228         Iterator   iter = diversionLogs.values().iterator();
229         while(iter.hasNext()) {
230             FilePrintWriter writer = (FilePrintWriter) iter.next();
231             writer.close();
232             writer.getFile().renameTo(
233                     new File  (writer.getFile().getAbsolutePath()
234                             .replaceFirst("\\.open$", ".divert")));
235         }
236         diversionLogs.clear();
237         logGeneration = nowGeneration;
238     }
239 
240     /**
241      * Look up the crawler node name to which the given CandidateURI 
242      * should be mapped. 
243      * 
244      * @param cauri CandidateURI to consider
245      * @return String node name which should handle URI
246      */
247     protected abstract String   map(CandidateURI cauri);
248 
249     
250     /**
251      * Note the given CandidateURI in the appropriate diversion log. 
252      * 
253      * @param cauri CandidateURI to append to a diversion log
254      * @param target String node name (log name) to receive URI
255      */
256     protected synchronized void divertLog(CandidateURI cauri, String   target) {
257         if(recentlySeen(cauri)) {
258             return;
259         }
260         PrintWriter   diversionLog = getDiversionLog(target);
261         cauri.singleLineReportTo(diversionLog);
262         diversionLog.println();
263     }
264     
265     /**
266      * Consult the cache to determine if the given URI
267      * has been recently seen -- entering it if not. 
268      * 
269      * @param cauri CandidateURI to test
270      * @return true if URI was already in the cache; false otherwise 
271      */
272     private boolean recentlySeen(CandidateURI cauri) {
273         long fp = FPGenerator.std64.fp(cauri.toString());
274         return ! cache.add(fp);
275     }
276 
277     /**
278      * Get the diversion log for a given target crawler node node. 
279      * 
280      * @param target crawler node name of requested log
281      * @return PrintWriter open on an appropriately-named 
282      * log file
283      */
284     protected PrintWriter   getDiversionLog(String   target) {
285         FilePrintWriter writer = (FilePrintWriter) diversionLogs.get(target);
286         if(writer == null) {
287             String   divertDirPath = (String  ) getUncheckedAttribute(null,ATTR_DIVERSION_DIR);
288             File   divertDir = new File  (divertDirPath);
289             if (!divertDir.isAbsolute()) {
290                 divertDir = new File  (getSettingsHandler().getOrder()
291                         .getController().getDisk(), divertDirPath);
292             }
293             divertDir.mkdirs();
294             File   divertLog = 
295                 new File  (divertDir,
296                          logGeneration+"-"+localName+"-to-"+target+".open");
297             try {
298                 writer = new FilePrintWriter(divertLog);
299             } catch (FileNotFoundException   e) {
300                 // TODO Auto-generated catch block
301                 e.printStackTrace();
302                 throw new RuntimeException  (e);
303             }
304             diversionLogs.put(target,writer);
305         } 
306         return writer;
307     }
308 
309     protected void initialTasks() {
310         super.initialTasks();
311         localName = (String  ) getUncheckedAttribute(null, ATTR_LOCAL_NAME);
312         cache = new ArrayLongFPCache();
313     }
314 }
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags