KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > archive > crawler > framework > Processor


1 /* Copyright (C) 2003 Internet Archive.
2  *
3  * This file is part of the Heritrix web crawler (crawler.archive.org).
4  *
5  * Heritrix is free software; you can redistribute it and/or modify
6  * it under the terms of the GNU Lesser Public License as published by
7  * the Free Software Foundation; either version 2.1 of the License, or
8  * any later version.
9  *
10  * Heritrix is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  * GNU Lesser Public License for more details.
14  *
15  * You should have received a copy of the GNU Lesser Public License
16  * along with Heritrix; if not, write to the Free Software
17  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18  *
19  * Processor.java
20  * Created on Apr 16, 2003
21  *
22  * $Header: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/java/org/archive/crawler/framework/Processor.java,v 1.32.2.1 2007/01/13 01:31:22 stack-sf Exp $
23  */

24 package org.archive.crawler.framework;
25
26 import java.lang.reflect.Constructor JavaDoc;
27 import java.util.Iterator JavaDoc;
28 import java.util.logging.Level JavaDoc;
29 import java.util.logging.Logger JavaDoc;
30
31 import javax.management.AttributeNotFoundException JavaDoc;
32
33 import org.archive.crawler.datamodel.CrawlURI;
34 import org.archive.crawler.settings.MapType;
35 import org.archive.crawler.settings.ModuleType;
36 import org.archive.crawler.settings.SimpleType;
37
38 /**
39  * Base class for URI processing classes.
40  *
41  * <p> Each URI is processed by a user defined series of processors. This class
42  * provides the basic infrastructure for these but does not actually do
43  * anything. New processors can be easily created by subclassing this class.
44  *
45  * <p> Classes subclassing this one should not trap InterruptedExceptions.
46  * They should be allowed to propagate to the ToeThread executing the processor.
47  * Also they should immediately exit their main method (<tt>innerProcess()</tt>)
48  * if the <tt>interrupted</tt> flag is set.
49  *
50  * @author Gordon Mohr
51  *
52  * @see org.archive.crawler.framework.ToeThread
53  */

54 public class Processor extends ModuleType {
55
56     private static final long serialVersionUID = 6248563827413710226L;
57
58     /**
59      * Key to use asking settings for filters value.
60      */

61     public final static String JavaDoc ATTR_FILTERS = "filters";
62
63     /**
64      * Key to use asking settings for enabled value.
65      */

66     public final static String JavaDoc ATTR_ENABLED = "enabled";
67
68     private MapType filters;
69     private Processor defaultNextProcessor = null;
70
71     private static Logger JavaDoc logger =
72         Logger.getLogger("org.archive.crawler.framework.Processor");
73
74     /**
75      * @param name
76      * @param description
77      */

78     public Processor(String JavaDoc name, String JavaDoc description) {
79         super(name, description);
80         addElementToDefinition(new SimpleType(ATTR_ENABLED,
81             "Is processor enabled", new Boolean JavaDoc(true)));
82         filters = (MapType) addElementToDefinition(new MapType(ATTR_FILTERS,
83             "Filters applied to this processor", Filter.class));
84     }
85
86     /**
87      * Perform processing on the given CrawlURI.
88      *
89      * @param curi
90      * @throws InterruptedException
91      */

92     public final void process(CrawlURI curi) throws InterruptedException JavaDoc {
93         // by default, arrange for curi to proceed to next processor
94
curi.setNextProcessor(getDefaultNextProcessor(curi));
95
96         // Check if this processor is enabled before processing
97
try {
98             if (!((Boolean JavaDoc) getAttribute(ATTR_ENABLED, curi)).booleanValue()) {
99                 return;
100             }
101         } catch (AttributeNotFoundException JavaDoc e) {
102             logger.severe(e.getMessage());
103         }
104
105         if(filtersAccept(curi)) {
106             innerProcess(curi);
107         } else {
108             innerRejectProcess(curi);
109         }
110     }
111
112     protected void checkForInterrupt() throws InterruptedException JavaDoc {
113         if (Thread.interrupted()) {
114             throw new InterruptedException JavaDoc("interrupted");
115         }
116     }
117
118     /**
119      * @param curi CrawlURI instance.
120      * @throws InterruptedException
121      */

122     protected void innerRejectProcess(CrawlURI curi)
123     throws InterruptedException JavaDoc {
124         // by default do nothing
125
}
126
127     /**
128      * Classes subclassing this one should override this method to perform
129      * their custom actions on the CrawlURI.
130      *
131      * @param curi The CrawlURI being processed.
132      * @throws InterruptedException
133      */

134     protected void innerProcess(CrawlURI curi)
135     throws InterruptedException JavaDoc {
136         // by default do nothing
137
}
138
139     /**
140      * Classes subclassing this one should override this method to perform
141      * processor specific actions.
142      * <p>
143      *
144      * This method is garanteed to be called after the crawl is set up, but
145      * before any URI-processing has occured.
146      */

147     protected void initialTasks () {
148         // by default do nothing
149
}
150
151     /**
152      * Classes subclassing this one should override this method to perform
153      * processor specific actions.
154      *
155      */

156     protected void finalTasks () {
157         // by default do nothing
158
}
159
160     /**
161      * Do all specified filters (if any) accept this CrawlURI?
162      *
163      * @param curi
164      * @return True if all filters accept this CrawlURI.
165      */

166     protected boolean filtersAccept(CrawlURI curi) {
167         return filtersAccept(this.filters, curi);
168     }
169     
170     /**
171      * Do all specified filters (if any) accept this CrawlURI?
172      *
173      * @param curi
174      * @param fs Filters to process.
175      * @return True if all filters accept this CrawlURI.
176      */

177     protected boolean filtersAccept(MapType fs, CrawlURI curi) {
178         if (fs.isEmpty(curi)) {
179             return true;
180         }
181         for (Iterator JavaDoc i = fs.iterator(curi); i.hasNext();) {
182             Filter filter = (Filter)i.next();
183             if (!filter.accepts(curi)) {
184                 if (logger.isLoggable(Level.INFO)) {
185                     logger.info(filter + " rejected " + curi +
186                         " in Processor " + getName());
187                 }
188                 return false;
189             }
190         }
191         return true;
192     }
193
194     /**
195      * Returns the next processor for the given CrawlURI in the processor chain.
196      * @param curi The CrawlURI that we want to find the next processor for.
197      * @return The next processor for the given CrawlURI in the processor chain.
198      */

199     public Processor getDefaultNextProcessor(CrawlURI curi) {
200         return defaultNextProcessor;
201     }
202
203     /** Set the default next processor in the chain.
204      *
205      * @param nextProcessor the default next processor in the chain.
206      */

207     public void setDefaultNextProcessor(Processor nextProcessor) {
208         defaultNextProcessor = nextProcessor;
209     }
210
211     /**
212      * Get the controller object.
213      *
214      * @return the controller object.
215      */

216     public CrawlController getController() {
217         return getSettingsHandler().getOrder().getController();
218     }
219
220     public Processor spawn(int serialNum) {
221         Processor newInst = null;
222         try {
223             Constructor JavaDoc co =
224                 getClass().getConstructor(new Class JavaDoc[] { String JavaDoc.class });
225             newInst =
226                 (Processor) co.newInstance(new Object JavaDoc[] {
227                     getName() + serialNum
228                     });
229             getParent().setAttribute(newInst);
230             newInst.setTransient(true);
231         } catch (Exception JavaDoc e) {
232             // TODO Auto-generated catch block
233
e.printStackTrace();
234         }
235         return newInst;
236     }
237
238     /**
239      * Compiles and returns a report (in human readable form) about the status
240      * of the processor. The processor's name (of implementing class) should
241      * always be included.
242      * <p>
243      * Examples of stats declared would include:<br>
244      * * Number of CrawlURIs handled.<br>
245      * * Number of links extracted (for link extractors)<br>
246      * etc.
247      *
248      * @return A human readable report on the processor's state.
249      */

250     public String JavaDoc report(){
251         return ""; // Default behavior.
252
}
253     
254     /**
255      * @param curi CrawlURI to examine.
256      * @return True if content to process -- content length is > 0
257      * -- and links have not yet been extracted.
258      */

259     protected boolean isContentToProcess(CrawlURI curi) {
260         return !curi.hasBeenLinkExtracted() && curi.getContentLength() > 0;
261     }
262     
263     /**
264      * @param curi CrawlURI to examine.
265      * @return True if {@link #isContentToProcess(CrawlURI)} and
266      * the CrawlURI represents a successful http transaction.
267      */

268     protected boolean isHttpTransactionContentToProcess(CrawlURI curi) {
269         return isContentToProcess(curi) &&
270             curi.isHttpTransaction() &&
271             curi.isSuccess();
272     }
273     
274     /**
275      * @param contentType Found content type.
276      * @param expectedPrefix String to find at start of contenttype: e.g.
277      * <code>text/html</code>.
278      * @return True if passed content-type begins with
279      * expected mimetype.
280      */

281     protected boolean isExpectedMimeType(String JavaDoc contentType,
282             String JavaDoc expectedPrefix) {
283         return contentType != null &&
284             contentType.toLowerCase().startsWith(expectedPrefix);
285     }
286
287     public void kickUpdate() {
288         // by default do nothing
289
}
290 }
291
Popular Tags