KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > archive > crawler > extractor > Extractor


1 /* Extractor
2 *
3 * $Id: Extractor.java,v 1.4 2006/08/15 01:31:34 stack-sf Exp $
4 *
5 * Created on Sep 22, 2005
6 *
7 * Copyright (C) 2005 Internet Archive.
8 *
9 * This file is part of the Heritrix web crawler (crawler.archive.org).
10 *
11 * Heritrix is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU Lesser Public License as published by
13 * the Free Software Foundation; either version 2.1 of the License, or
14 * any later version.
15 *
16 * Heritrix is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU Lesser Public License for more details.
20 *
21 * You should have received a copy of the GNU Lesser Public License
22 * along with Heritrix; if not, write to the Free Software
23 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
24 */

25 package org.archive.crawler.extractor;
26
27 import java.util.logging.Level JavaDoc;
28 import java.util.logging.Logger JavaDoc;
29
30 import org.archive.crawler.datamodel.CrawlURI;
31 import org.archive.crawler.framework.Processor;
32
33 /**
34  * Convenience shared superclass for Extractor Processors.
35  *
36  * Currently only wraps Extractor-specific extract() action with
37  * a StackOverflowError catch/log/proceed handler, so that any
38  * extractors that recurse too deep on problematic input will
39  * only suffer a local error, and other normal CrawlURI processing
40  * can continue. See:
41  * [ 1122836 ] Localize StackOverflowError in Extractors
42  * http://sourceforge.net/tracker/index.php?func=detail&aid=1122836&group_id=73833&atid=539099
43  *
44  * This class could also become home to common utility features
45  * of extractors, like a running tally of the URIs examined/discovered,
46  * etc.
47  *
48  * @author gojomo
49  */

50 public abstract class Extractor extends Processor {
51     private static final Logger JavaDoc logger = Logger
52         .getLogger(Extractor.class.getName());
53
54     /**
55      * Passthrough constructor.
56      *
57      * @param name
58      * @param description
59      */

60     public Extractor(String JavaDoc name, String JavaDoc description) {
61         super(name, description);
62         // TODO Auto-generated constructor stub
63
}
64
65     public void innerProcess(CrawlURI curi) {
66         try {
67             extract(curi);
68         } catch (NullPointerException JavaDoc npe) {
69             // both annotate (to highlight in crawl log) & add as local-error
70
curi.addAnnotation("err=" + npe.getClass().getName());
71             curi.addLocalizedError(getName(), npe, "");
72             // also log as warning
73
logger.log(Level.WARNING, getName() + ": NullPointerException",
74                 npe);
75         } catch (StackOverflowError JavaDoc soe) {
76             // both annotate (to highlight in crawl log) & add as local-error
77
curi.addAnnotation("err=" + soe.getClass().getName());
78             curi.addLocalizedError(getName(), soe, "");
79             // also log as warning
80
logger.log(Level.WARNING, getName() + ": StackOverflowError", soe);
81         } catch (java.nio.charset.CoderMalfunctionError JavaDoc cme) {
82             // See http://sourceforge.net/tracker/index.php?func=detail&aid=1540222&group_id=73833&atid=539099
83
// Both annotate (to highlight in crawl log) & add as local-error
84
curi.addAnnotation("err=" + cme.getClass().getName());
85             curi.addLocalizedError(getName(), cme, ""); // <-- Message field ignored when logging.
86
logger.log(Level.WARNING, getName() + ": CoderMalfunctionError",
87                 cme);
88         }
89     }
90
91     protected abstract void extract(CrawlURI curi);
92 }
93
Popular Tags