// KickJava - Java API By Example, From Geeks To Geeks.
//
// Java > Open Source Codes > org > archive > crawler > extractor > ExtractorTool


/* ExtractorTool
 *
 * Created on Mar 14, 2005
 *
 * Copyright (C) 2005 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

package org.archive.crawler.extractor;

import java.io.File;
import java.io.IOException;
import java.lang.reflect.Constructor;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.logging.ConsoleHandler;
import java.util.logging.Handler;
import java.util.logging.Logger;

import javax.management.Attribute;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.PosixParser;
import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.HttpMethodBase;
import org.apache.commons.httpclient.URIException;
import org.archive.crawler.datamodel.CoreAttributeConstants;
import org.archive.crawler.datamodel.CrawlOrder;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.framework.Processor;
import org.archive.crawler.settings.CrawlerSettings;
import org.archive.crawler.settings.MapType;
import org.archive.crawler.settings.SettingsHandler;
import org.archive.crawler.settings.XMLSettingsHandler;
import org.archive.io.arc.ARCReader;
import org.archive.io.arc.ARCReaderFactory;
import org.archive.io.arc.ARCRecord;
import org.archive.net.UURIFactory;
import org.archive.util.HttpRecorder;
import org.archive.util.OneLineSimpleLogger;
59
60 /**
61  * Run named extractors against passed ARC file.
62  * This extractor tool runs suboptimally. It takes each ARC file record,
63  * writes it to a new scratch file, and then it runs each listed
64  * extractor against the scratch. It works in this manner because
65  * extractors want CharSequence, being able to refer to characters
66  * by absolute position, but ARCs are compressed streams. The work
67  * to get a CharSequence on an underlying compressed stream has not
68  * been done. Other issues are need to setup CrawlerSetting environment
69  * so extractors can run.
70  * @author stack
71  * @version $Date: 2007/01/13 01:31:16 $, $Revision: 1.4.16.1 $
72  */

73 public class ExtractorTool {
74 // private static final Logger logger =
75
// Logger.getLogger(ExtractorTool.class.getName());
76
static {
77         // Setup the oneline logger.
78
Handler JavaDoc [] hs = Logger.getLogger("").getHandlers();
79         for (int i = 0; i < hs.length; i++) {
80             Handler JavaDoc h = hs[0];
81             if (h instanceof ConsoleHandler JavaDoc) {
82                 h.setFormatter(new OneLineSimpleLogger());
83             }
84         }
85     }
86     
87     private static final String JavaDoc [] DEFAULT_EXTRACTORS =
88         {"org.archive.crawler.extractor.ExtractorHTTP",
89             "org.archive.crawler.extractor.ExtractorHTML"};
90     private final List JavaDoc<Processor> extractors;
91     private final File JavaDoc scratchDir;
92     private static final String JavaDoc DEFAULT_SCRATCH = "/tmp";
93     
94     public ExtractorTool()
95     throws Exception JavaDoc {
96         this(DEFAULT_EXTRACTORS, DEFAULT_SCRATCH);
97     }
98     
99     public ExtractorTool(String JavaDoc [] e, String JavaDoc scratch)
100     throws Exception JavaDoc {
101         super();
102         // Setup the scratch directory.
103
this.scratchDir = scratch == null?
104             new File JavaDoc(DEFAULT_SCRATCH): new File JavaDoc(scratch);
105         if (!this.scratchDir.exists()) {
106             this.scratchDir.mkdirs();
107         }
108         // Set up settings system. Needed by extractors.
109
File JavaDoc orderFile = new File JavaDoc(this.scratchDir.getAbsolutePath(),
110             ExtractorTool.class.getName() + "_order.xml");
111         SettingsHandler settingsHandler = new XMLSettingsHandler(orderFile);
112         settingsHandler.initialize();
113         settingsHandler.getOrder().
114             setAttribute(new Attribute JavaDoc(CrawlOrder.ATTR_SETTINGS_DIRECTORY,
115                 this.scratchDir.getAbsolutePath()));
116         CrawlerSettings globalSettings =
117             settingsHandler.getSettingsObject(null);
118         MapType extractorsSettings = (MapType)settingsHandler.getOrder().
119             getAttribute(CrawlOrder.ATTR_EXTRACT_PROCESSORS);
120         this.extractors = new ArrayList JavaDoc<Processor>();
121         for (int i = 0; i < e.length; i++) {
122             Constructor JavaDoc c = Class.forName(e[i]).
123                 getConstructor(new Class JavaDoc [] {String JavaDoc.class});
124             String JavaDoc name = Integer.toString(i);
125             Processor p = (Processor)c.newInstance(new Object JavaDoc [] {name});
126             extractorsSettings.addElement(globalSettings, p);
127             p.setAttribute(
128                 new Attribute JavaDoc(Processor.ATTR_ENABLED, Boolean.TRUE));
129             this.extractors.add(p);
130         }
131     }
132     
133     public void extract(String JavaDoc resource) throws IOException JavaDoc,
134     URIException, InterruptedException JavaDoc {
135         ARCReader reader = ARCReaderFactory.get(new File JavaDoc(resource));
136         for (Iterator JavaDoc i = reader.iterator(); i.hasNext();) {
137             ARCRecord ar = (ARCRecord)i.next();
138             HttpRecorder hr = HttpRecorder.
139                 wrapInputStreamWithHttpRecord(this.scratchDir,
140                     this.getClass().getName(), ar, null);
141             CrawlURI curi = getCrawlURI(ar, hr);
142             for (Iterator JavaDoc ii = this.extractors.iterator(); ii.hasNext();) {
143                 ((Processor)ii.next()).process(curi);
144             }
145             outlinks(curi);
146         }
147     }
148     
149     protected void outlinks(CrawlURI curi) {
150         System.out.println(curi.getUURI().toString());
151         for(Link l: curi.getOutLinks()) {
152             System.out.println(" " + l.getDestination() + " " +
153                 l.getHopType() + " " + l.getContext());
154         }
155     }
156     
157     protected CrawlURI getCrawlURI(final ARCRecord record,
158             final HttpRecorder hr)
159     throws URIException {
160         CrawlURI curi = new CrawlURI(UURIFactory.
161             getInstance(record.getMetaData().getUrl()));
162         curi.setContentSize(record.getMetaData().getLength());
163         curi.setContentType(record.getMetaData().getMimetype());
164         curi.setHttpRecorder(hr);
165         // Fake out the extractor that this is a legit HTTP transaction.
166
if (!curi.getUURI().getScheme().equals("filedesc")) {
167             curi.putObject(CoreAttributeConstants.A_HTTP_TRANSACTION,
168                 new HttpMethodBase() {
169                     public String JavaDoc getName() {
170                         return this.getClass().getName() + "_method";
171                     }
172
173                     public Header getResponseHeader(String JavaDoc headerName) {
174                         String JavaDoc value = (String JavaDoc)record.getMetaData().
175                             getHeaderValue(headerName);
176                         return (value == null || value.length() == 0)?
177                             null: new Header(headerName, value);
178                     }
179             });
180             String JavaDoc statusCode = record.getMetaData().getStatusCode();
181             curi.setFetchStatus(statusCode == null?
182                 200: Integer.parseInt(statusCode));
183         }
184         return curi;
185     }
186     
187     /**
188      * Format usage message.
189      * @param formatter Help formatter instance.
190      * @param options Usage options.
191      * @param exitCode Exit code.
192      */

193     private static void usage(HelpFormatter formatter, Options options,
194             int exitCode) {
195         formatter.printHelp("java " + ExtractorTool.class.getName() +
196             " \\\n[--scratch=DIR] [--extractor=EXTRACTOR1,EXTRACTOR2,...] ARC", options);
197         System.exit(exitCode);
198     }
199     
200     public static void main(String JavaDoc[] args)
201     throws Exception JavaDoc {
202         Options options = new Options();
203         options.addOption(new Option("h", "help", false,
204             "Prints this message and exits."));
205         StringBuffer JavaDoc defaultExtractors = new StringBuffer JavaDoc();
206         for (int i = 0; i < DEFAULT_EXTRACTORS.length; i++) {
207             if (i > 0) {
208                 defaultExtractors.append(", ");
209             }
210             defaultExtractors.append(DEFAULT_EXTRACTORS[i]);
211         }
212         options.addOption(new Option("e", "extractor", true,
213             "List of comma-separated extractor class names. " +
214             "Run in order listed. " +
215             "If no extractors listed, runs following: " +
216             defaultExtractors.toString() + "."));
217         options.addOption(new Option("s", "scratch", true,
218             "Directory to write scratch files to. Default: '/tmp'."));
219         PosixParser parser = new PosixParser();
220         CommandLine cmdline = parser.parse(options, args, false);
221         List JavaDoc cmdlineArgs = cmdline.getArgList();
222         Option [] cmdlineOptions = cmdline.getOptions();
223         HelpFormatter formatter = new HelpFormatter();
224         // If no args, print help.
225
if (cmdlineArgs.size() <= 0) {
226             usage(formatter, options, 0);
227         }
228
229         // Now look at options passed.
230
String JavaDoc [] extractors = DEFAULT_EXTRACTORS;
231         String JavaDoc scratch = null;
232         for (int i = 0; i < cmdlineOptions.length; i++) {
233             switch(cmdlineOptions[i].getId()) {
234                 case 'h':
235                     usage(formatter, options, 0);
236                     break;
237
238                 case 'e':
239                     String JavaDoc value = cmdlineOptions[i].getValue();
240                     if (value == null || value.length() <= 0) {
241                         // Allow saying NO extractors so we can see
242
// how much it costs just reading through
243
// ARCs.
244
extractors = new String JavaDoc [0];
245                     } else {
246                         extractors = value.split(",");
247                     }
248                     break;
249                     
250                 case 's':
251                     scratch = cmdlineOptions[i].getValue();
252                     break;
253                   
254                 default:
255                     throw new RuntimeException JavaDoc("Unexpected option: " +
256                         + cmdlineOptions[i].getId());
257             }
258         }
259         
260         ExtractorTool tool = new ExtractorTool(extractors, scratch);
261         for (Iterator JavaDoc i = cmdlineArgs.iterator(); i.hasNext();) {
262             tool.extract((String JavaDoc)i.next());
263         }
264     }
265 }
// Popular Tags