1 23 package org.archive.crawler.extractor; 24 25 import java.io.File ; 26 import java.io.IOException ; 27 import java.lang.reflect.Constructor ; 28 import java.util.ArrayList ; 29 import java.util.Iterator ; 30 import java.util.List ; 31 import java.util.logging.ConsoleHandler ; 32 import java.util.logging.Handler ; 33 import java.util.logging.Logger ; 34 35 import javax.management.Attribute ; 36 37 import org.apache.commons.cli.CommandLine; 38 import org.apache.commons.cli.HelpFormatter; 39 import org.apache.commons.cli.Option; 40 import org.apache.commons.cli.Options; 41 import org.apache.commons.cli.PosixParser; 42 import org.apache.commons.httpclient.Header; 43 import org.apache.commons.httpclient.HttpMethodBase; 44 import org.apache.commons.httpclient.URIException; 45 import org.archive.crawler.datamodel.CoreAttributeConstants; 46 import org.archive.crawler.datamodel.CrawlOrder; 47 import org.archive.crawler.datamodel.CrawlURI; 48 import org.archive.crawler.framework.Processor; 49 import org.archive.crawler.settings.CrawlerSettings; 50 import org.archive.crawler.settings.MapType; 51 import org.archive.crawler.settings.SettingsHandler; 52 import org.archive.crawler.settings.XMLSettingsHandler; 53 import org.archive.io.arc.ARCReader; 54 import org.archive.io.arc.ARCReaderFactory; 55 import org.archive.io.arc.ARCRecord; 56 import org.archive.net.UURIFactory; 57 import org.archive.util.HttpRecorder; 58 import org.archive.util.OneLineSimpleLogger; 59 60 73 public class ExtractorTool { 74 static { 77 Handler [] hs = Logger.getLogger("").getHandlers(); 79 for (int i = 0; i < hs.length; i++) { 80 Handler h = hs[0]; 81 if (h instanceof ConsoleHandler ) { 82 h.setFormatter(new OneLineSimpleLogger()); 83 } 84 } 85 } 86 87 private static final String [] DEFAULT_EXTRACTORS = 88 {"org.archive.crawler.extractor.ExtractorHTTP", 89 "org.archive.crawler.extractor.ExtractorHTML"}; 90 private final List <Processor> extractors; 91 private final File scratchDir; 92 private static final String DEFAULT_SCRATCH = "/tmp"; 93 94 public ExtractorTool() 95 throws Exception { 96 this(DEFAULT_EXTRACTORS, DEFAULT_SCRATCH); 97 } 98 99 public ExtractorTool(String [] e, String scratch) 100 throws Exception { 101 super(); 102 this.scratchDir = scratch == null? 104 new File (DEFAULT_SCRATCH): new File (scratch); 105 if (!this.scratchDir.exists()) { 106 this.scratchDir.mkdirs(); 107 } 108 File orderFile = new File (this.scratchDir.getAbsolutePath(), 110 ExtractorTool.class.getName() + "_order.xml"); 111 SettingsHandler settingsHandler = new XMLSettingsHandler(orderFile); 112 settingsHandler.initialize(); 113 settingsHandler.getOrder(). 114 setAttribute(new Attribute (CrawlOrder.ATTR_SETTINGS_DIRECTORY, 115 this.scratchDir.getAbsolutePath())); 116 CrawlerSettings globalSettings = 117 settingsHandler.getSettingsObject(null); 118 MapType extractorsSettings = (MapType)settingsHandler.getOrder(). 119 getAttribute(CrawlOrder.ATTR_EXTRACT_PROCESSORS); 120 this.extractors = new ArrayList <Processor>(); 121 for (int i = 0; i < e.length; i++) { 122 Constructor c = Class.forName(e[i]). 123 getConstructor(new Class [] {String .class}); 124 String name = Integer.toString(i); 125 Processor p = (Processor)c.newInstance(new Object [] {name}); 126 extractorsSettings.addElement(globalSettings, p); 127 p.setAttribute( 128 new Attribute (Processor.ATTR_ENABLED, Boolean.TRUE)); 129 this.extractors.add(p); 130 } 131 } 132 133 public void extract(String resource) throws IOException , 134 URIException, InterruptedException { 135 ARCReader reader = ARCReaderFactory.get(new File (resource)); 136 for (Iterator i = reader.iterator(); i.hasNext();) { 137 ARCRecord ar = (ARCRecord)i.next(); 138 HttpRecorder hr = HttpRecorder. 139 wrapInputStreamWithHttpRecord(this.scratchDir, 140 this.getClass().getName(), ar, null); 141 CrawlURI curi = getCrawlURI(ar, hr); 142 for (Iterator ii = this.extractors.iterator(); ii.hasNext();) { 143 ((Processor)ii.next()).process(curi); 144 } 145 outlinks(curi); 146 } 147 } 148 149 protected void outlinks(CrawlURI curi) { 150 System.out.println(curi.getUURI().toString()); 151 for(Link l: curi.getOutLinks()) { 152 System.out.println(" " + l.getDestination() + " " + 153 l.getHopType() + " " + l.getContext()); 154 } 155 } 156 157 protected CrawlURI getCrawlURI(final ARCRecord record, 158 final HttpRecorder hr) 159 throws URIException { 160 CrawlURI curi = new CrawlURI(UURIFactory. 161 getInstance(record.getMetaData().getUrl())); 162 curi.setContentSize(record.getMetaData().getLength()); 163 curi.setContentType(record.getMetaData().getMimetype()); 164 curi.setHttpRecorder(hr); 165 if (!curi.getUURI().getScheme().equals("filedesc")) { 167 curi.putObject(CoreAttributeConstants.A_HTTP_TRANSACTION, 168 new HttpMethodBase() { 169 public String getName() { 170 return this.getClass().getName() + "_method"; 171 } 172 173 public Header getResponseHeader(String headerName) { 174 String value = (String )record.getMetaData(). 175 getHeaderValue(headerName); 176 return (value == null || value.length() == 0)? 177 null: new Header(headerName, value); 178 } 179 }); 180 String statusCode = record.getMetaData().getStatusCode(); 181 curi.setFetchStatus(statusCode == null? 182 200: Integer.parseInt(statusCode)); 183 } 184 return curi; 185 } 186 187 193 private static void usage(HelpFormatter formatter, Options options, 194 int exitCode) { 195 formatter.printHelp("java " + ExtractorTool.class.getName() + 196 " \\\n[--scratch=DIR] [--extractor=EXTRACTOR1,EXTRACTOR2,...] ARC", options); 197 System.exit(exitCode); 198 } 199 200 public static void main(String [] args) 201 throws Exception { 202 Options options = new Options(); 203 options.addOption(new Option("h", "help", false, 204 "Prints this message and exits.")); 205 StringBuffer defaultExtractors = new StringBuffer (); 206 for (int i = 0; i < DEFAULT_EXTRACTORS.length; i++) { 207 if (i > 0) { 208 defaultExtractors.append(", "); 209 } 210 defaultExtractors.append(DEFAULT_EXTRACTORS[i]); 211 } 212 options.addOption(new Option("e", "extractor", true, 213 "List of comma-separated extractor class names. " + 214 "Run in order listed. " + 215 "If no extractors listed, runs following: " + 216 defaultExtractors.toString() + ".")); 217 options.addOption(new Option("s", "scratch", true, 218 "Directory to write scratch files to. Default: '/tmp'.")); 219 PosixParser parser = new PosixParser(); 220 CommandLine cmdline = parser.parse(options, args, false); 221 List cmdlineArgs = cmdline.getArgList(); 222 Option [] cmdlineOptions = cmdline.getOptions(); 223 HelpFormatter formatter = new HelpFormatter(); 224 if (cmdlineArgs.size() <= 0) { 226 usage(formatter, options, 0); 227 } 228 229 String [] extractors = DEFAULT_EXTRACTORS; 231 String scratch = null; 232 for (int i = 0; i < cmdlineOptions.length; i++) { 233 switch(cmdlineOptions[i].getId()) { 234 case 'h': 235 usage(formatter, options, 0); 236 break; 237 238 case 'e': 239 String value = cmdlineOptions[i].getValue(); 240 if (value == null || value.length() <= 0) { 241 extractors = new String [0]; 245 } else { 246 extractors = value.split(","); 247 } 248 break; 249 250 case 's': 251 scratch = cmdlineOptions[i].getValue(); 252 break; 253 254 default: 255 throw new RuntimeException ("Unexpected option: " + 256 + cmdlineOptions[i].getId()); 257 } 258 } 259 260 ExtractorTool tool = new ExtractorTool(extractors, scratch); 261 for (Iterator i = cmdlineArgs.iterator(); i.hasNext();) { 262 tool.extract((String )i.next()); 263 } 264 } 265 } 266 | Popular Tags |