KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > archive > crawler > filter > TransclusionFilter


1 /* Copyright (C) 2003 Internet Archive.
2  *
3  * This file is part of the Heritrix web crawler (crawler.archive.org).
4  *
5  * Heritrix is free software; you can redistribute it and/or modify
6  * it under the terms of the GNU Lesser Public License as published by
7  * the Free Software Foundation; either version 2.1 of the License, or
8  * any later version.
9  *
10  * Heritrix is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  * GNU Lesser Public License for more details.
14  *
15  * You should have received a copy of the GNU Lesser Public License
16  * along with Heritrix; if not, write to the Free Software
17  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18  *
19  * TransclusionFilter.java
20  * Created on Oct 3, 2003
21  *
22  * $Header: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/java/org/archive/crawler/filter/TransclusionFilter.java,v 1.21.2.1 2007/01/13 01:31:21 stack-sf Exp $
23  */

24 package org.archive.crawler.filter;
25
import java.util.logging.Level;
import java.util.logging.Logger;

import javax.management.AttributeNotFoundException;

import org.archive.crawler.datamodel.CandidateURI;
import org.archive.crawler.deciderules.DecideRule;
import org.archive.crawler.deciderules.DecidingFilter;
import org.archive.crawler.extractor.Link;
import org.archive.crawler.framework.CrawlScope;
import org.archive.crawler.framework.Filter;
import org.archive.crawler.scope.ClassicScope;
import org.archive.crawler.settings.SimpleType;
36
37 /**
38  * Filter which accepts CandidateURI/CrawlURI instances which contain more
39  * than zero but fewer than max-trans-hops entries at the end of their
40  * discovery path.
41  *
42  * @author Gordon Mohr
43  * @deprecated As of release 1.10.0. Replaced by {@link DecidingFilter} and
44  * equivalent {@link DecideRule}.
45  */

46 public class TransclusionFilter extends Filter {
47
48     private static final long serialVersionUID = 4251767672778714051L;
49
50     private static final String JavaDoc ATTR_MAX_SPECULATIVE_HOPS =
51         "max-speculative-hops";
52     private static final String JavaDoc ATTR_MAX_REFERRAL_HOPS = "max-referral-hops";
53     private static final String JavaDoc ATTR_MAX_EMBED_HOPS = "max-embed-hops";
54     private static final int DEFAULT_MAX_TRANS_HOPS = 4;
55
56     /**
57      * Default speculative hops.
58      *
59      * No more than 1
60      */

61     private static final int DEFAULT_MAX_SPECULATIVE_HOPS = 1;
62
63     /**
64      * Default maximum referral hops.
65      *
66      * No limit beside the overall trans limit
67      */

68     private static final int DEFAULT_MAX_REFERRAL_HOPS = -1;
69
70     /**
71      * Default embedded link hops.
72      *
73      * No limit beside the overall trans limit
74      */

75     private static final int DEFAULT_MAX_EMBED_HOPS = -1;
76
77     int maxTransHops = DEFAULT_MAX_TRANS_HOPS;
78     int maxSpeculativeHops = DEFAULT_MAX_SPECULATIVE_HOPS;
79     int maxReferralHops = DEFAULT_MAX_REFERRAL_HOPS;
80     int maxEmbedHops = DEFAULT_MAX_EMBED_HOPS;
81
82 // // 1-3 trailing P(recondition)/R(eferral)/E(mbed)/X(speculative-embed) hops
83
// private static final String TRANSCLUSION_PATH = ".*[PREX][PREX]?[PREX]?$";
84

85     /**
86      * @param name
87      */

88     public TransclusionFilter(String JavaDoc name) {
89         super(name, "Transclusion filter *Deprecated* Use" +
90                 "DecidingFilter and equivalent DecideRule instead.");
91
92         addElementToDefinition(
93             new SimpleType(
94                 ATTR_MAX_SPECULATIVE_HOPS,
95                 "Maximum number of consecutive speculative (i.e. URIs" +
96                 " extracted that we are not sure if they are embeds or" +
97                 " not) hops to allow.\nA value of -1 means no upper limit.",
98                 new Integer JavaDoc(DEFAULT_MAX_SPECULATIVE_HOPS)));
99         addElementToDefinition(
100             new SimpleType(
101                 ATTR_MAX_REFERRAL_HOPS,
102                 "Maximum number of consecutive referral hops to allow.\n" +
103                 "A value of -1 means no upper limit.",
104                 new Integer JavaDoc(DEFAULT_MAX_REFERRAL_HOPS)));
105         addElementToDefinition(
106             new SimpleType(
107                 ATTR_MAX_EMBED_HOPS,
108                 "Maximum number of consecutive embed hops to allow.\n" +
109                 "A value of -1 means no upper limit.",
110                 new Integer JavaDoc(DEFAULT_MAX_EMBED_HOPS)));
111     }
112
113     /* (non-Javadoc)
114      * @see org.archive.crawler.framework.Filter#innerAccepts(java.lang.Object)
115      */

116     protected boolean innerAccepts(Object JavaDoc o) {
117         if(! (o instanceof CandidateURI)) {
118             return false;
119         }
120         String JavaDoc path = ((CandidateURI)o).getPathFromSeed();
121         int transCount = 0;
122         int specCount = 0;
123         int refCount = 0;
124         int embedCount = 0;
125         loop: for(int i=path.length()-1;i>=0;i--) {
126             // everything except 'L' is considered transitive
127
switch (path.charAt(i)) {
128                 case Link.NAVLINK_HOP: {
129                     break loop;
130                 }
131                 case Link.PREREQ_HOP: {
132                     if(transCount==0) {
133                         // always consider a trailing P as a 1-hop trans inclusion; disregard previous hops
134
transCount++;
135                         break loop;
136                     }
137                     // otherwise, just count as another regular trans hop
138
break;
139                 }
140                 case Link.SPECULATIVE_HOP: {
141                     specCount++;
142                     break;
143                 }
144                 case Link.REFER_HOP: {
145                     refCount++;
146                     break;
147                 }
148                 case Link.EMBED_HOP: {
149                     embedCount++;
150                     break;
151                 }
152                 // FIXME: what is 'D'?
153
// 'D's get a free pass
154
}
155             transCount++;
156         }
157
158         readMaxValues(o);
159
160         // This is a case of possible transclusion
161
return (transCount > 0)
162             // ...and the overall number of hops isn't too high
163
&& (transCount <= this.maxTransHops)
164             // ...and the number of spec-hops isn't too high
165
&& (this.maxSpeculativeHops < 0 || specCount <= this.maxSpeculativeHops)
166             // ...and the number of referral-hops isn't too high
167
&& (this.maxReferralHops < 0 || refCount <= this.maxReferralHops)
168             // ...and the number of embed-hops isn't too high
169
&& (this.maxEmbedHops < 0 || embedCount <= this.maxEmbedHops);
170     }
171
172     public void readMaxValues(Object JavaDoc o) {
173         try {
174             CrawlScope scope =
175                 (CrawlScope) globalSettings().getModule(CrawlScope.ATTR_NAME);
176             this.maxTransHops = ((Integer JavaDoc) scope.getAttribute(o, ClassicScope.ATTR_MAX_TRANS_HOPS)).intValue();
177             this.maxSpeculativeHops = ((Integer JavaDoc) getAttribute(o, ATTR_MAX_SPECULATIVE_HOPS)).intValue();
178             this.maxReferralHops = ((Integer JavaDoc) getAttribute(o, ATTR_MAX_REFERRAL_HOPS)).intValue();
179             this.maxEmbedHops = ((Integer JavaDoc) getAttribute(o, ATTR_MAX_EMBED_HOPS)).intValue();
180         } catch (AttributeNotFoundException JavaDoc e) {
181             // TODO Auto-generated catch block
182
e.printStackTrace();
183         }
184     }
185
186 }
187
Popular Tags