package org.archive.crawler.util;

import org.archive.crawler.frontier.RecoveryJournal;

import java.io.File;
import java.io.FileOutputStream;
import java.io.LineNumberReader;
import java.io.PrintWriter;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;

/**
 * Parses a Heritrix recovery journal ("recover log"), mapping every
 * discovered URL back to the seed URL it was (transitively) discovered
 * from, and recording which URLs were successfully crawled.
 *
 * <p>Only {@code F_ADD} lines (discoveries) and {@code F_SUCCESS} lines
 * (successful fetches) are consulted; {@code dns:} URLs are skipped.
 * If an {@code F_ADD} line references a via URL for which no seed is
 * known, either a {@link SeedUrlNotFoundException} is thrown or, when a
 * "seed not found" log file was supplied, the problem is logged there
 * and parsing continues.
 *
 * <p>Not thread-safe: all state is built in {@link #load} and read-only
 * thereafter; accessors expose internal collections without copying.
 */
public class RecoveryLogMapper {

    /** First character of journal lines we care about ('F' of F+ / Fs). */
    private static final char LOG_LINE_START_CHAR =
        RecoveryJournal.F_ADD.charAt(0);

    private static final Logger logger =
        Logger.getLogger(RecoveryLogMapper.class.getName());

    /** When non-null, "via URL has no seed" problems are logged here
     *  instead of aborting the parse with an exception. */
    private PrintWriter seedNotFoundPrintWriter = null;

    /** Maps each crawled (discovered) URL to the seed it came from. */
    private Map<String, String> crawledUrlToSeedMap =
        new HashMap<String, String>();

    /** Maps each seed URL to the set of URLs discovered from it. */
    private Map<String, Set<String>> seedUrlToDiscoveredUrlsMap =
        new HashMap<String, Set<String>>();

    /** All URLs for which an F_SUCCESS line was seen. */
    private Set<String> successfullyCrawledUrls = new HashSet<String>();

    /**
     * Builds the mapper from a recovery log, failing fast on any via URL
     * whose seed cannot be determined.
     *
     * @param recoverLogFileName path of the recovery journal to parse
     * @throws java.io.FileNotFoundException if the log cannot be opened
     * @throws java.io.IOException on read error
     * @throws SeedUrlNotFoundException if an F+ line references a via URL
     *         with no known seed
     */
    public RecoveryLogMapper(String recoverLogFileName)
        throws java.io.FileNotFoundException, java.io.IOException,
               SeedUrlNotFoundException {
        load(recoverLogFileName);
    }

    /**
     * Builds the mapper from a recovery log; "seed not found" problems
     * are appended to {@code seedNotFoundLogFileName} instead of
     * aborting the parse.
     *
     * @param recoverLogFileName path of the recovery journal to parse
     * @param seedNotFoundLogFileName file to receive seed-not-found messages
     * @throws java.io.FileNotFoundException if either file cannot be opened
     * @throws java.io.IOException on read error
     * @throws SeedUrlNotFoundException never in this mode (declared for
     *         signature compatibility with {@link #load})
     */
    public RecoveryLogMapper(String recoverLogFileName,
                             String seedNotFoundLogFileName)
        throws java.io.FileNotFoundException, java.io.IOException,
               SeedUrlNotFoundException {
        seedNotFoundPrintWriter = new PrintWriter(new FileOutputStream(
            seedNotFoundLogFileName));
        load(recoverLogFileName);
    }

    /**
     * Parses the recovery journal and populates the three maps/sets.
     * The reader (and the seed-not-found writer, if any) are closed even
     * if parsing aborts with an exception.
     *
     * @param recoverLogFileName path of the recovery journal to parse
     * @throws java.io.FileNotFoundException if the log cannot be opened
     * @throws java.io.IOException on read error
     * @throws SeedUrlNotFoundException if an F+ line references a via URL
     *         with no known seed and no seed-not-found writer is set
     */
    protected void load(String recoverLogFileName)
        throws java.io.FileNotFoundException, java.io.IOException,
               SeedUrlNotFoundException {
        LineNumberReader reader = new LineNumberReader(RecoveryJournal.
            getBufferedReader(new File(recoverLogFileName)));
        // try/finally guarantees the reader and optional writer are
        // released even when parsing throws (the original leaked both).
        try {
            String curLine = null;
            while ((curLine = reader.readLine()) != null) {
                if (curLine.length() == 0
                        || curLine.charAt(0) != LOG_LINE_START_CHAR) {
                    continue;
                }
                String args[] = curLine.split("\\s+");
                int curLineNumWords = args.length;
                String firstUrl = args[1];
                // DNS lookups are bookkeeping, not real crawled URLs.
                if (firstUrl.startsWith("dns:")) {
                    continue;
                }
                if (curLine.startsWith(RecoveryJournal.F_ADD)) {
                    if (curLineNumWords == 2) {
                        // F+ with no via URL: firstUrl is itself a seed.
                        if (logger.isLoggable(Level.FINE)) {
                            logger.fine("F_ADD with 2 words --> seed URL (" +
                                firstUrl + ")");
                        }
                        if (seedUrlToDiscoveredUrlsMap.get(firstUrl) == null) {
                            seedUrlToDiscoveredUrlsMap.put(firstUrl,
                                new HashSet<String>());
                        }
                    } else {
                        // F+ with a via URL: last word is the referrer.
                        String viaUrl = args[curLineNumWords - 1];
                        if (logger.isLoggable(Level.FINE)) {
                            logger.fine("F_ADD with 3+ words --> new URL "
                                + firstUrl + " via URL " + viaUrl);
                        }
                        String seedForFirstUrl =
                            crawledUrlToSeedMap.get(viaUrl);
                        if (seedForFirstUrl == null) {
                            // via URL has no recorded seed, so it must be
                            // a seed itself.
                            if (logger.isLoggable(Level.FINE)) {
                                logger.fine("\tvia URL is a seed");
                            }
                            crawledUrlToSeedMap.put(firstUrl, viaUrl);
                            seedForFirstUrl = viaUrl;
                        } else {
                            if (logger.isLoggable(Level.FINE)) {
                                logger.fine("\tvia URL discovered via seed URL " +
                                    seedForFirstUrl);
                            }
                            crawledUrlToSeedMap.put(firstUrl, seedForFirstUrl);
                        }
                        Set<String> theSeedUrlList =
                            seedUrlToDiscoveredUrlsMap.get(seedForFirstUrl);
                        if (theSeedUrlList == null) {
                            String message = "recover log " +
                                recoverLogFileName + " at line " +
                                reader.getLineNumber() +
                                " listed F+ URL (" + viaUrl +
                                ") for which found no seed list.";
                            if (seedNotFoundPrintWriter != null) {
                                seedNotFoundPrintWriter.println(message);
                            } else {
                                throw new SeedUrlNotFoundException(message);
                            }
                        } else {
                            theSeedUrlList.add(firstUrl);
                        }
                    }
                } else if (curLine.startsWith(RecoveryJournal.F_SUCCESS)) {
                    if (logger.isLoggable(Level.FINE)) {
                        logger.fine("F_SUCCESS for URL " + firstUrl);
                    }
                    successfullyCrawledUrls.add(firstUrl);
                }
            }
        } finally {
            reader.close();
            if (seedNotFoundPrintWriter != null) {
                seedNotFoundPrintWriter.close();
            }
        }
    }

    /**
     * Returns the seed URL from which {@code urlString} was discovered,
     * or {@code urlString} itself if it is a seed, or {@code null} if
     * the URL is unknown.
     *
     * @param urlString URL to look up
     * @return the associated seed URL, or null if not found
     */
    public String getSeedForUrl(String urlString) {
        return (seedUrlToDiscoveredUrlsMap.get(urlString) != null) ?
            urlString : crawledUrlToSeedMap.get(urlString);
    }

    /**
     * Returns the (internal, mutable) map of seed URL to the set of URLs
     * discovered from it.
     *
     * @return map of seed URL to discovered-URL set
     */
    public Map<String, Set<String>> getSeedUrlToDiscoveredUrlsMap() {
        return this.seedUrlToDiscoveredUrlsMap;
    }

    /**
     * Returns the (internal, mutable) set of successfully crawled URLs.
     *
     * @return set of URLs with an F_SUCCESS journal entry
     */
    public Set<String> getSuccessfullyCrawledUrls() {
        return this.successfullyCrawledUrls;
    }

    /**
     * Returns this class's logger.
     *
     * @return the shared Logger instance
     */
    public static Logger getLogger() {
        return logger;
    }

    /**
     * Iterates over the URLs discovered from one seed that were also
     * successfully crawled. Lazily filters the seed's discovered-URL set
     * against {@link #getSuccessfullyCrawledUrls()}.
     */
    private class SuccessfullyCrawledURLsIterator
            implements Iterator<String> {
        /** Next matching URL, or null if not yet computed/exhausted. */
        private String nextValue = null;
        private Iterator<String> discoveredUrlsIterator;

        public SuccessfullyCrawledURLsIterator(String seedUrlString)
            throws SeedUrlNotFoundException {
            Set<String> discoveredUrlList =
                getSeedUrlToDiscoveredUrlsMap().get(seedUrlString);
            if (discoveredUrlList == null) {
                throw new SeedUrlNotFoundException("Seed URL " +
                    seedUrlString + " not found in seed list");
            }
            discoveredUrlsIterator = discoveredUrlList.iterator();
        }

        /**
         * Advances to the next successfully-crawled URL, if any, leaving
         * it in {@link #nextValue}. No-op if one is already staged.
         */
        private void populateNextValue() {
            // Short-circuit && (the original used bitwise &, which
            // evaluated hasNext() even when a value was already staged).
            while (nextValue == null && discoveredUrlsIterator.hasNext()) {
                String curDiscoveredUrl = discoveredUrlsIterator.next();
                boolean succCrawled = getSuccessfullyCrawledUrls().
                    contains(curDiscoveredUrl);
                if (getLogger().isLoggable(Level.FINE)) {
                    getLogger().fine("populateNextValue: curDiscoveredUrl=" +
                        curDiscoveredUrl + ", succCrawled=" +
                        succCrawled);
                }
                if (succCrawled) {
                    nextValue = curDiscoveredUrl;
                }
            }
        }

        public boolean hasNext() {
            populateNextValue();
            return (nextValue != null);
        }

        public String next() {
            populateNextValue();
            if (nextValue == null) {
                // Iterator contract: the original silently returned null.
                throw new NoSuchElementException();
            }
            String returnValue = nextValue;
            nextValue = null;
            return returnValue;
        }

        /**
         * Not supported; this iterator is read-only.
         *
         * @throws UnsupportedOperationException always
         */
        public void remove() {
            throw new UnsupportedOperationException(
                "SuccessfullyCrawledURLsIterator.remove: not supported.");
        }
    }

    /**
     * Returns an iterator over the URLs successfully crawled from the
     * given seed URL.
     *
     * @param seedUrlString seed URL whose crawled URLs are wanted
     * @return iterator of successfully crawled URLs
     * @throws SeedUrlNotFoundException if the seed URL is unknown
     */
    public Iterator<String> getIteratorOfURLsSuccessfullyCrawledFromSeedUrl(
            String seedUrlString) throws SeedUrlNotFoundException {
        return new SuccessfullyCrawledURLsIterator(seedUrlString);
    }

    /**
     * Returns the (internal, live) collection of seed URLs.
     *
     * @return collection of all seed URLs seen in the journal
     */
    public Collection<String> getSeedCollection() {
        return seedUrlToDiscoveredUrlsMap.keySet();
    }

    /**
     * Command-line entry point: prints, per seed, the URLs successfully
     * crawled from that seed.
     *
     * @param args args[0] is the recovery log file name
     */
    public static void main(String args[]) {
        if (args.length < 1) {
            System.out.println("Usage: RecoveryLogMapper recoverLogFileName");
            System.exit(-1);
        }
        String recoverLogFileName = args[0];
        try {
            RecoveryLogMapper myRecoveryLogMapper =
                new RecoveryLogMapper(recoverLogFileName);
            for (String curSeedUrl : myRecoveryLogMapper.getSeedCollection()) {
                System.out.println("URLs successfully crawled from seed URL "
                    + curSeedUrl);
                Iterator<String> iteratorOfUrlsCrawledFromSeedUrl =
                    myRecoveryLogMapper.
                        getIteratorOfURLsSuccessfullyCrawledFromSeedUrl(
                            curSeedUrl);
                while (iteratorOfUrlsCrawledFromSeedUrl.hasNext()) {
                    String curCrawledUrlString =
                        iteratorOfUrlsCrawledFromSeedUrl.next();
                    System.out.println(" -> " + curCrawledUrlString);
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}