KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > archive > crawler > postprocessor > ContentBasedWaitEvaluator


1 /* ContentBasedWaitEvaluator
2  *
3  * $Id: ContentBasedWaitEvaluator.java,v 1.4.18.1 2007/01/13 01:31:24 stack-sf Exp $
4  *
5  * Created on 1.4.2005
6  *
7  * Copyright (C) 2005 Kristinn Sigurdsson
8  *
9  * This file is part of the Heritrix web crawler (crawler.archive.org).
10  *
11  * Heritrix is free software; you can redistribute it and/or modify
12  * it under the terms of the GNU Lesser Public License as published by
13  * the Free Software Foundation; either version 2.1 of the License, or
14  * any later version.
15  *
16  * Heritrix is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19  * GNU Lesser Public License for more details.
20  *
21  * You should have received a copy of the GNU Lesser Public License
22  * along with Heritrix; if not, write to the Free Software
23  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
24  */

25 package org.archive.crawler.postprocessor;
26
27 import javax.management.AttributeNotFoundException JavaDoc;
28
29 import org.archive.crawler.datamodel.CrawlURI;
30 import org.archive.crawler.settings.SimpleType;
31 import org.archive.util.TextUtils;
32
33 /**
34  * A WaitEvaluator that compares the CrawlURIs content type to a configurable
35  * regular expression. If it matches, then the wait evaluation is performed.
36  * Otherwise the processor passes on the CrawlURI, doing nothing.
37  *
38  * @author Kristinn Sigurdsson
39  *
40  * @see org.archive.crawler.postprocessor.WaitEvaluator
41  */

42 public class ContentBasedWaitEvaluator extends WaitEvaluator {
43     
44     private static final long serialVersionUID = 1623347208782997347L;
45
46     /** The regular expression that we limit this evaluator to. */
47     public final static String JavaDoc ATTR_CONTENT_REGEXPR =
48         "content-regular-expression";
49     protected final static String JavaDoc DEFAULT_CONTENT_REGEXPR = "^.*$"; //Everything
50

51     /**
52      * Constructor
53      *
54      * @param name The name of the module
55      */

56     public ContentBasedWaitEvaluator(String JavaDoc name) {
57         this(name,"Evaluates how long to wait before fetching a URI again. " +
58                 "Only handles CrawlURIs whose content type matches the " +
59                 "regular expression set. " +
60                 "Typically, this processor should be in the post processing " +
61                 "chain. It will pass if another wait evaluator has already " +
62                 "processed the CrawlURI.", DEFAULT_CONTENT_REGEXPR,
63                 DEFAULT_INITIAL_WAIT_INTERVAL,
64                 DEFAULT_MAX_WAIT_INTERVAL,
65                 DEFAULT_MIN_WAIT_INTERVAL,
66                 DEFAULT_UNCHANGED_FACTOR,
67                 DEFAULT_CHANGED_FACTOR);
68     }
69
70     /**
71      * Constructor
72      *
73      * @param name The name of the module
74      * @param description Description of the module
75      * @param default_inital_wait_interval The default value for initial wait
76      * time
77      * @param default_max_wait_interval The maximum value for wait time
78      * @param default_min_wait_interval The minimum value for wait time
79      * @param default_unchanged_factor The factor for changing wait times of
80      * unchanged documents (will be multiplied by this value)
81      * @param default_changed_factor The factor for changing wait times of
82      * changed documents (will be divided by this value)
83      */

84     public ContentBasedWaitEvaluator(String JavaDoc name, String JavaDoc description,
85             String JavaDoc defaultRegExpr,
86             Long JavaDoc default_inital_wait_interval,
87             Long JavaDoc default_max_wait_interval,
88             Long JavaDoc default_min_wait_interval,
89             Double JavaDoc default_unchanged_factor,
90             Double JavaDoc default_changed_factor){
91         super(name,description,
92                 default_inital_wait_interval,
93                 default_max_wait_interval,
94                 default_min_wait_interval,
95                 default_unchanged_factor,
96                 default_changed_factor);
97
98         addElementToDefinition(new SimpleType(ATTR_CONTENT_REGEXPR,
99                 "Only URIs whose content type matches this regular " +
100                 "expression will be evaluated.",
101                 defaultRegExpr));
102
103     }
104     
105     protected void innerProcess(CrawlURI curi) throws InterruptedException JavaDoc {
106         // Check if content type is available and if it matches the reg.expr.
107
String JavaDoc content_type = curi.getContentType();
108         if(content_type==null){
109             // No content type, exit
110
return;
111         }
112         String JavaDoc regexpr;
113         try {
114             regexpr = (String JavaDoc)getAttribute(curi,ATTR_CONTENT_REGEXPR);
115         } catch (AttributeNotFoundException JavaDoc e) {
116             logger.warning("Regular expression for content type not found");
117             return;
118         }
119
120         if(TextUtils.matches(regexpr, content_type) == false){
121             // Content type does not match reg.expr. Exit
122
return;
123         }
124         // Ok, it matches, invoke parent method.
125

126         super.innerProcess(curi);
127     }
128 }
129
Popular Tags