KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > archive > crawler > deciderules > NotExceedsDocumentLengthTresholdDecideRule


/* $Id: NotExceedsDocumentLengthTresholdDecideRule.java,v 1.2.2.1 2007/01/13 01:31:14 stack-sf Exp $
 *
 * Created on 28.8.2006
 *
 * Copyright (C) 2006 Olaf Freyer
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

package org.archive.crawler.deciderules;

import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.HttpMethod;
import org.archive.crawler.datamodel.CoreAttributeConstants;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.settings.SimpleType;
32
33 public class NotExceedsDocumentLengthTresholdDecideRule
34 extends PredicatedDecideRule implements CoreAttributeConstants {
35     
36
37     private static final long serialVersionUID = -8774160016195991876L;
38
39     private static final Logger JavaDoc logger = Logger.
40         getLogger(NotExceedsDocumentLengthTresholdDecideRule.class.getName());
41     public static final String JavaDoc ATTR_CONTENT_LENGTH_TRESHOLD =
42         "content-length-treshold";
43     static final Integer JavaDoc DEFAULT_CONTENT_LENGTH_TRESHOLD = -1;
44     public static final String JavaDoc ATTR_USE_AS_MIDFETCH = "use-as-midfetch-filter";
45     static final Boolean JavaDoc DEFAULT_USE_AS_MIDFETCH = new Boolean JavaDoc(true);
46     
47     
48     // Header predictor state constants
49
public static final int HEADER_PREDICTS_MISSING = -1;
50     
51     public NotExceedsDocumentLengthTresholdDecideRule(String JavaDoc name){
52         super(name);
53         setDescription("NotExceedsDocumentLengthTresholdDecideRule. " +
54                 "REJECTs URIs "+
55                 "with content length exceeding a given treshold. "+
56                 "Either examines HTTP header content length or " +
57                 "actual downloaded content length and returns false " +
58                 "for documents exceeding a given length treshold.");
59         
60         addElementToDefinition(new SimpleType(ATTR_USE_AS_MIDFETCH,
61                 "Shall this rule be used as a midfetch rule? If true, " +
62                 "this rule will determine content length based on HTTP " +
63                 "header information, otherwise the size of the already " +
64                 "downloaded content will be used.",
65                 DEFAULT_USE_AS_MIDFETCH));
66
67         addElementToDefinition(new SimpleType(ATTR_CONTENT_LENGTH_TRESHOLD,
68             "Max " +
69             "content-length this filter will allow to pass through. If -1, " +
70             "then no limit.",
71             DEFAULT_CONTENT_LENGTH_TRESHOLD));
72     }
73     
74     protected boolean evaluate(Object JavaDoc object) {
75         try {
76             CrawlURI curi = (CrawlURI)object;
77             
78             int contentlength = HEADER_PREDICTS_MISSING;
79
80             //filter used as midfetch filter
81
if (getIsMidfetchRule(object)){
82                 
83                     if(curi.containsKey(A_HTTP_TRANSACTION) == false){
84                         // Missing header info, let pass
85
if (logger.isLoggable(Level.INFO)) {
86                             logger.info("Error: Missing HttpMethod object in " +
87                                 "CrawlURI. " + curi.toString());
88                         }
89                         return false;
90                     }
91                 
92                     // Initially assume header info is missing
93
HttpMethod method =
94                         (HttpMethod)curi.getObject(A_HTTP_TRANSACTION);
95
96                     // get content-length
97
String JavaDoc newContentlength = null;
98                     if (method.getResponseHeader("content-length") != null) {
99                         newContentlength = method.
100                             getResponseHeader("content-length").getValue();
101                     }
102                 
103                     if (newContentlength != null &&
104                             newContentlength.length() > 0) {
105                         try {
106                             contentlength = Integer.parseInt(newContentlength);
107                         } catch (NumberFormatException JavaDoc nfe) {
108                             // Ignore.
109
}
110                     }
111                 
112                     // If no document length was reported or format was wrong,
113
// let pass
114
if (contentlength == HEADER_PREDICTS_MISSING) {
115                         return false;
116                     }
117             } else {
118                 contentlength = (int)curi.getContentSize();
119             }
120
121             return makeDecision(contentlength, object);
122                 
123         } catch (ClassCastException JavaDoc e) {
124             // if not CrawlURI, always disregard
125
return false;
126         }
127     }
128     
129     /**
130      * @param contentLength content length to check against treshold
131      * @param obj Context object.
132      * @return contentLength not exceeding treshold?
133      */

134     protected Boolean JavaDoc makeDecision(int contentLength, Object JavaDoc obj) {
135         return contentLength < getContentLengthTreshold(obj);
136     }
137     
138     /**
139      * @param obj Context object.
140      * @return content length threshold
141      */

142     protected int getContentLengthTreshold(Object JavaDoc obj) {
143         int len = ((Integer JavaDoc)getUncheckedAttribute(obj,
144             ATTR_CONTENT_LENGTH_TRESHOLD)).intValue();
145         return len == -1? Integer.MAX_VALUE: len;
146     }
147
148     /**
149      * @param obj Context object.
150      * @return to be used as midfetch rule?
151      */

152     private Boolean JavaDoc getIsMidfetchRule(Object JavaDoc obj) {
153         return ((Boolean JavaDoc)getUncheckedAttribute(obj,ATTR_USE_AS_MIDFETCH)).
154             booleanValue();
155     }
156 }
Popular Tags