1 24 package org.archive.crawler.framework; 25 26 import java.lang.reflect.Constructor ; 27 import java.util.Iterator ; 28 import java.util.logging.Level ; 29 import java.util.logging.Logger ; 30 31 import javax.management.AttributeNotFoundException ; 32 33 import org.archive.crawler.datamodel.CrawlURI; 34 import org.archive.crawler.settings.MapType; 35 import org.archive.crawler.settings.ModuleType; 36 import org.archive.crawler.settings.SimpleType; 37 38 54 public class Processor extends ModuleType { 55 56 private static final long serialVersionUID = 6248563827413710226L; 57 58 61 public final static String ATTR_FILTERS = "filters"; 62 63 66 public final static String ATTR_ENABLED = "enabled"; 67 68 private MapType filters; 69 private Processor defaultNextProcessor = null; 70 71 private static Logger logger = 72 Logger.getLogger("org.archive.crawler.framework.Processor"); 73 74 78 public Processor(String name, String description) { 79 super(name, description); 80 addElementToDefinition(new SimpleType(ATTR_ENABLED, 81 "Is processor enabled", new Boolean (true))); 82 filters = (MapType) addElementToDefinition(new MapType(ATTR_FILTERS, 83 "Filters applied to this processor", Filter.class)); 84 } 85 86 92 public final void process(CrawlURI curi) throws InterruptedException { 93 curi.setNextProcessor(getDefaultNextProcessor(curi)); 95 96 try { 98 if (!((Boolean ) getAttribute(ATTR_ENABLED, curi)).booleanValue()) { 99 return; 100 } 101 } catch (AttributeNotFoundException e) { 102 logger.severe(e.getMessage()); 103 } 104 105 if(filtersAccept(curi)) { 106 innerProcess(curi); 107 } else { 108 innerRejectProcess(curi); 109 } 110 } 111 112 protected void checkForInterrupt() throws InterruptedException { 113 if (Thread.interrupted()) { 114 throw new InterruptedException ("interrupted"); 115 } 116 } 117 118 122 protected void innerRejectProcess(CrawlURI curi) 123 throws InterruptedException { 124 } 126 127 134 protected void innerProcess(CrawlURI curi) 135 throws InterruptedException { 136 } 138 139 147 protected void initialTasks () { 148 } 150 151 156 protected void finalTasks () { 157 } 159 160 166 protected boolean filtersAccept(CrawlURI curi) { 167 return filtersAccept(this.filters, curi); 168 } 169 170 177 protected boolean filtersAccept(MapType fs, CrawlURI curi) { 178 if (fs.isEmpty(curi)) { 179 return true; 180 } 181 for (Iterator i = fs.iterator(curi); i.hasNext();) { 182 Filter filter = (Filter)i.next(); 183 if (!filter.accepts(curi)) { 184 if (logger.isLoggable(Level.INFO)) { 185 logger.info(filter + " rejected " + curi + 186 " in Processor " + getName()); 187 } 188 return false; 189 } 190 } 191 return true; 192 } 193 194 199 public Processor getDefaultNextProcessor(CrawlURI curi) { 200 return defaultNextProcessor; 201 } 202 203 207 public void setDefaultNextProcessor(Processor nextProcessor) { 208 defaultNextProcessor = nextProcessor; 209 } 210 211 216 public CrawlController getController() { 217 return getSettingsHandler().getOrder().getController(); 218 } 219 220 public Processor spawn(int serialNum) { 221 Processor newInst = null; 222 try { 223 Constructor co = 224 getClass().getConstructor(new Class [] { String .class }); 225 newInst = 226 (Processor) co.newInstance(new Object [] { 227 getName() + serialNum 228 }); 229 getParent().setAttribute(newInst); 230 newInst.setTransient(true); 231 } catch (Exception e) { 232 e.printStackTrace(); 234 } 235 return newInst; 236 } 237 238 250 public String report(){ 251 return ""; } 253 254 259 protected boolean isContentToProcess(CrawlURI curi) { 260 return !curi.hasBeenLinkExtracted() && curi.getContentLength() > 0; 261 } 262 263 268 protected boolean isHttpTransactionContentToProcess(CrawlURI curi) { 269 return isContentToProcess(curi) && 270 curi.isHttpTransaction() && 271 curi.isSuccess(); 272 } 273 274 281 protected boolean isExpectedMimeType(String contentType, 282 String expectedPrefix) { 283 return contentType != null && 284 contentType.toLowerCase().startsWith(expectedPrefix); 285 } 286 287 public void kickUpdate() { 288 } 290 } 291 | Popular Tags |