package org.archive.crawler.processor;

import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.PrintWriter;
import java.util.HashMap;
import java.util.Iterator;
import java.util.logging.Logger;

import javax.management.AttributeNotFoundException;

import org.archive.crawler.datamodel.CandidateURI;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.datamodel.FetchStatusCodes;
import org.archive.crawler.deciderules.DecideRule;
import org.archive.crawler.deciderules.DecideRuleSequence;
import org.archive.crawler.framework.Processor;
import org.archive.crawler.settings.SimpleType;
import org.archive.util.ArchiveUtils;
import org.archive.util.fingerprint.ArrayLongFPCache;

import st.ata.util.FPGenerator;

/**
 * Base class for processors that map URIs to a named crawler node.
 *
 * <p>Each URI is passed to {@link #map(CandidateURI)}, which returns the name
 * of a target. URIs whose target equals the configured local name are
 * processed normally; all others are "diverted": they are written to a
 * per-target diversion log instead of being handled here.
 *
 * <p>Diversion logs are written under the configured diversion directory as
 * {@code <generation>-<localName>-to-<target>.open} and renamed to
 * {@code .divert} when the generation (a prefix of the current 14-digit
 * timestamp) changes.
 */
public abstract class CrawlMapper extends Processor implements FetchStatusCodes {

    private static final Logger LOGGER =
        Logger.getLogger(CrawlMapper.class.getName());

    /**
     * PrintWriter that remembers the File it writes to, so the file can be
     * renamed once the writer has been closed.
     */
    private static final class FilePrintWriter extends PrintWriter {
        private final File file;

        public FilePrintWriter(File file) throws FileNotFoundException {
            super(new BufferedOutputStream(new FileOutputStream(file)));
            this.file = file;
        }

        public File getFile() {
            return file;
        }
    }

    /** Whether to apply the mapping to the URI being processed itself. */
    public static final String ATTR_CHECK_URI = "check-uri";
    public static final Boolean DEFAULT_CHECK_URI = Boolean.TRUE;

    /** Whether to apply the mapping to discovered outlinks. */
    public static final String ATTR_CHECK_OUTLINKS = "check-outlinks";
    public static final Boolean DEFAULT_CHECK_OUTLINKS = Boolean.TRUE;

    /** Decide rules consulted to determine whether an outlink is mapped. */
    public static final String ATTR_MAP_OUTLINK_DECIDE_RULES = "decide-rules";

    /** Name of the local crawler node; mappings to it are not diverted. */
    public static final String ATTR_LOCAL_NAME = "local-name";
    public static final String DEFAULT_LOCAL_NAME = ".";

    /** Directory to which diversion logs are written. */
    public static final String ATTR_DIVERSION_DIR = "diversion-dir";
    public static final String DEFAULT_DIVERSION_DIR = "diversions";

    /** Number of timestamp digits used as the prefix of log names. */
    public static final String ATTR_ROTATION_DIGITS = "rotation-digits";
    public static final Integer DEFAULT_ROTATION_DIGITS = Integer.valueOf(10);

    /** Open diversion logs, keyed by target name. */
    HashMap<String, PrintWriter> diversionLogs
        = new HashMap<String, PrintWriter>();

    /**
     * Timestamp prefix used in the names of the currently open logs.
     * Written only while holding this object's monitor; volatile so the
     * unlocked fast-path comparison in innerProcess sees recent values.
     */
    volatile String logGeneration = "";

    /** Name of the local crawler node, read from settings in initialTasks. */
    protected String localName;

    /** Fingerprints of URIs already written to a diversion log. */
    protected ArrayLongFPCache cache;

    /**
     * Registers the settings understood by every mapper.
     *
     * @param name processor name, passed to the superclass
     * @param description processor description, passed to the superclass
     */
    public CrawlMapper(String name, String description) {
        super(name, description);
        addElementToDefinition(new SimpleType(ATTR_LOCAL_NAME,
            "Name of local crawler node; mappings to this name " +
            "result in normal processing (no diversion).",
            DEFAULT_LOCAL_NAME));
        addElementToDefinition(new SimpleType(ATTR_DIVERSION_DIR,
            "Directory to write diversion logs.",
            DEFAULT_DIVERSION_DIR));
        addElementToDefinition(new SimpleType(ATTR_CHECK_URI,
            "Whether to apply the mapping to a URI being processed " +
            "itself, for example early in processing (while its " +
            "status is still 'unattempted').",
            DEFAULT_CHECK_URI));
        addElementToDefinition(new SimpleType(ATTR_CHECK_OUTLINKS,
            "Whether to apply the mapping to discovered outlinks, " +
            "for example after extraction has occurred. ",
            DEFAULT_CHECK_OUTLINKS));
        addElementToDefinition(new DecideRuleSequence(
            ATTR_MAP_OUTLINK_DECIDE_RULES));
        addElementToDefinition(new SimpleType(ATTR_ROTATION_DIGITS,
            "Number of timestamp digits to use as prefix of log " +
            "names (grouping all diversions from that period in " +
            "a single log). Default is 10 (hourly log rotation).",
            DEFAULT_ROTATION_DIGITS));
    }

    /**
     * Rotates the diversion logs if the timestamp prefix has changed, then
     * applies the mapping to the URI itself and/or its outlinks, as enabled
     * by the check-uri and check-outlinks settings.
     *
     * @param curi the URI being processed
     */
    protected void innerProcess(CrawlURI curi) {
        String timestamp = ArchiveUtils.get14DigitDate();
        int requestedDigits =
            ((Integer) getUncheckedAttribute(null, ATTR_ROTATION_DIGITS))
                .intValue();
        // Clamp so a misconfigured rotation-digits value (negative, or longer
        // than the timestamp) cannot make substring() throw on every URI.
        int prefixLength =
            Math.max(0, Math.min(requestedDigits, timestamp.length()));
        String nowGeneration = timestamp.substring(0, prefixLength);
        if (!nowGeneration.equals(logGeneration)) {
            rotateLogsIfStale(nowGeneration);
        }

        // A fetch status of 0 means the URI has not been attempted yet.
        if (curi.getFetchStatus() == 0
                && ((Boolean) getUncheckedAttribute(null, ATTR_CHECK_URI))
                    .booleanValue()) {
            String target = map(curi);
            if (!localName.equals(target)) {
                // Not ours: mark as blocked, note the target, skip straight
                // to postprocessing and record the diversion.
                curi.setFetchStatus(S_BLOCKED_BY_CUSTOM_PROCESSOR);
                curi.addAnnotation("to:" + target);
                curi.skipToProcessorChain(getController().
                    getPostprocessorChain());
                divertLog(curi, target);
            }
        }

        if (curi.getOutLinks().size() > 0
                && ((Boolean) getUncheckedAttribute(null, ATTR_CHECK_OUTLINKS))
                    .booleanValue()) {
            Iterator<CandidateURI> iter = curi.getOutCandidates().iterator();
            while (iter.hasNext()) {
                CandidateURI cauri = iter.next();
                if (decideToMapOutlink(cauri)) {
                    String target = map(cauri);
                    if (!localName.equals(target)) {
                        // Drop the outlink locally and log it for the target.
                        iter.remove();
                        divertLog(cauri, target);
                    }
                }
            }
        }
    }

    /**
     * Rotates the logs only if the generation still differs once the lock is
     * held. Without this re-check, two threads that both observed a stale
     * generation would each rotate, and the second would close and rename
     * logs that had just been opened for the new generation.
     *
     * @param nowGeneration the generation computed by the caller
     */
    private synchronized void rotateLogsIfStale(String nowGeneration) {
        if (!nowGeneration.equals(logGeneration)) {
            updateGeneration(nowGeneration);
        }
    }

    /**
     * Decides whether the mapping should be applied to an outlink.
     *
     * @param cauri the outlink under consideration
     * @return true unless the configured decide rules yield REJECT
     */
    protected boolean decideToMapOutlink(CandidateURI cauri) {
        boolean rejected = getMapOutlinkDecideRule(cauri).decisionFor(cauri)
            .equals(DecideRule.REJECT);
        return !rejected;
    }

    /**
     * Looks up the decide rule configured under the decide-rules attribute.
     *
     * @param o context object passed to the settings lookup
     * @return the configured decide rule
     * @throws RuntimeException wrapping AttributeNotFoundException if the
     *         attribute is missing
     */
    protected DecideRule getMapOutlinkDecideRule(Object o) {
        try {
            return (DecideRule) getAttribute(o, ATTR_MAP_OUTLINK_DECIDE_RULES);
        } catch (AttributeNotFoundException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Closes every open diversion log, renames each from {@code .open} to
     * {@code .divert}, forgets them, and adopts the given generation for
     * logs opened from now on.
     *
     * @param nowGeneration the new log-name prefix
     */
    protected synchronized void updateGeneration(String nowGeneration) {
        for (PrintWriter log : diversionLogs.values()) {
            FilePrintWriter writer = (FilePrintWriter) log;
            writer.close();
            File openFile = writer.getFile();
            File finishedFile = new File(openFile.getAbsolutePath()
                .replaceFirst("\\.open$", ".divert"));
            // renameTo reports failure only through its return value.
            if (!openFile.renameTo(finishedFile)) {
                LOGGER.warning("Unable to rename diversion log " + openFile
                    + " to " + finishedFile);
            }
        }
        diversionLogs.clear();
        logGeneration = nowGeneration;
    }

    /**
     * Maps a URI to the name of the target that should handle it.
     *
     * @param cauri the URI to map
     * @return the target name; a value equal to the local name means the
     *         URI is processed normally rather than diverted
     */
    protected abstract String map(CandidateURI cauri);

    /**
     * Writes a single-line report of the URI to the diversion log of the
     * given target, unless the URI was already logged recently.
     *
     * @param cauri the diverted URI
     * @param target the name the URI was mapped to
     */
    protected synchronized void divertLog(CandidateURI cauri, String target) {
        if (recentlySeen(cauri)) {
            return;
        }
        PrintWriter diversionLog = getDiversionLog(target);
        cauri.singleLineReportTo(diversionLog);
        diversionLog.println();
    }

    /**
     * Adds the URI's 64-bit fingerprint to the cache and reports whether it
     * was rejected by the cache.
     *
     * @param cauri the URI to check
     * @return the negation of what the cache's add() returned; presumably
     *         true when the fingerprint was already present (confirm
     *         against ArrayLongFPCache)
     */
    private boolean recentlySeen(CandidateURI cauri) {
        long fp = FPGenerator.std64.fp(cauri.toString());
        return !cache.add(fp);
    }

    /**
     * Returns the open diversion log for a target, creating the diversion
     * directory and the log file on first use. Not synchronized itself; the
     * only caller in this class, divertLog, holds this object's monitor.
     *
     * @param target the target name
     * @return the writer for that target's current log
     * @throws RuntimeException wrapping FileNotFoundException if the log
     *         file cannot be opened
     */
    protected PrintWriter getDiversionLog(String target) {
        FilePrintWriter writer = (FilePrintWriter) diversionLogs.get(target);
        if (writer == null) {
            String divertDirPath =
                (String) getUncheckedAttribute(null, ATTR_DIVERSION_DIR);
            File divertDir = new File(divertDirPath);
            if (!divertDir.isAbsolute()) {
                // Relative paths are resolved against the controller's disk
                // directory.
                divertDir = new File(getSettingsHandler().getOrder()
                    .getController().getDisk(), divertDirPath);
            }
            divertDir.mkdirs();
            File divertLog =
                new File(divertDir,
                    logGeneration + "-" + localName + "-to-" + target + ".open");
            try {
                writer = new FilePrintWriter(divertLog);
            } catch (FileNotFoundException e) {
                // The cause travels with the rethrown exception; no separate
                // stack-trace dump is needed.
                throw new RuntimeException(e);
            }
            diversionLogs.put(target, writer);
        }
        return writer;
    }

    /**
     * Reads the local node name from settings and creates the fingerprint
     * cache.
     */
    protected void initialTasks() {
        super.initialTasks();
        localName = (String) getUncheckedAttribute(null, ATTR_LOCAL_NAME);
        cache = new ArrayLongFPCache();
    }
}