package org.archive.crawler.filter;

import java.util.logging.Level;
import java.util.logging.Logger;

import javax.management.AttributeNotFoundException;

import org.archive.crawler.datamodel.CandidateURI;
import org.archive.crawler.deciderules.DecideRule;
import org.archive.crawler.deciderules.DecidingFilter;
import org.archive.crawler.extractor.Link;
import org.archive.crawler.framework.CrawlScope;
import org.archive.crawler.framework.Filter;
import org.archive.crawler.scope.ClassicScope;
import org.archive.crawler.settings.SimpleType;

/**
 * Filter that accepts a {@link CandidateURI} only when the tail of its
 * path-from-seed is a bounded run of non-navigational hops.
 *
 * <p>The path is scanned backwards from its last hop until a
 * {@link Link#NAVLINK_HOP} is met (or the path is exhausted). The hops seen
 * on the way are counted in total and, separately, per kind (speculative,
 * referral, embed). The URI is accepted when at least one hop was counted and
 * none of the configured limits is exceeded.
 *
 * <p>The description registered by the constructor marks this filter as
 * deprecated in favour of {@link DecidingFilter} combined with an equivalent
 * {@link DecideRule}.
 *
 * <p>NOTE(review): the limits are cached in instance fields that are
 * overwritten on every {@link #innerAccepts(Object)} call. If a single
 * instance is shared between threads this looks racy -- confirm against the
 * callers.
 */
public class TransclusionFilter extends Filter {

    private static final long serialVersionUID = 4251767672778714051L;

    private static final Logger LOGGER =
        Logger.getLogger(TransclusionFilter.class.getName());

    private static final String ATTR_MAX_SPECULATIVE_HOPS =
        "max-speculative-hops";
    private static final String ATTR_MAX_REFERRAL_HOPS = "max-referral-hops";
    private static final String ATTR_MAX_EMBED_HOPS = "max-embed-hops";

    /**
     * Initial value of {@link #maxTransHops}; the effective value is re-read
     * from the crawl scope by {@link #readMaxValues(Object)}.
     */
    private static final int DEFAULT_MAX_TRANS_HOPS = 4;

    /** Default limit on speculative hops: at most one. */
    private static final int DEFAULT_MAX_SPECULATIVE_HOPS = 1;

    /** Default limit on referral hops: -1, i.e. no dedicated upper limit. */
    private static final int DEFAULT_MAX_REFERRAL_HOPS = -1;

    /** Default limit on embed hops: -1, i.e. no dedicated upper limit. */
    private static final int DEFAULT_MAX_EMBED_HOPS = -1;

    /** Upper bound on the total number of counted trailing hops. */
    int maxTransHops = DEFAULT_MAX_TRANS_HOPS;
    /** Upper bound on counted speculative hops; negative disables the check. */
    int maxSpeculativeHops = DEFAULT_MAX_SPECULATIVE_HOPS;
    /** Upper bound on counted referral hops; negative disables the check. */
    int maxReferralHops = DEFAULT_MAX_REFERRAL_HOPS;
    /** Upper bound on counted embed hops; negative disables the check. */
    int maxEmbedHops = DEFAULT_MAX_EMBED_HOPS;

    /**
     * Creates the filter and registers its three configurable hop limits
     * (speculative, referral, embed) with their default values.
     *
     * @param name name passed through to the {@link Filter} superclass
     */
    public TransclusionFilter(String name) {
        // The original concatenation lacked the space after "Use" and
        // rendered as "UseDecidingFilter".
        super(name, "Transclusion filter *Deprecated* Use "
                + "DecidingFilter and equivalent DecideRule instead.");

        addElementToDefinition(
            new SimpleType(
                ATTR_MAX_SPECULATIVE_HOPS,
                "Maximum number of consecutive speculative (i.e. URIs"
                    + " extracted that we are not sure if they are embeds or"
                    + " not) hops to allow.\nA value of -1 means no upper limit.",
                Integer.valueOf(DEFAULT_MAX_SPECULATIVE_HOPS)));
        addElementToDefinition(
            new SimpleType(
                ATTR_MAX_REFERRAL_HOPS,
                "Maximum number of consecutive referral hops to allow.\n"
                    + "A value of -1 means no upper limit.",
                Integer.valueOf(DEFAULT_MAX_REFERRAL_HOPS)));
        addElementToDefinition(
            new SimpleType(
                ATTR_MAX_EMBED_HOPS,
                "Maximum number of consecutive embed hops to allow.\n"
                    + "A value of -1 means no upper limit.",
                Integer.valueOf(DEFAULT_MAX_EMBED_HOPS)));
    }

    /**
     * Decides whether the given object passes this filter.
     *
     * @param o object to test; anything that is not a {@link CandidateURI}
     *          is rejected
     * @return {@code true} when at least one trailing hop was counted, the
     *         total does not exceed {@link #maxTransHops}, and each per-kind
     *         count is within its limit (a negative limit disables that
     *         check); {@code false} otherwise, including when the
     *         path-from-seed is {@code null} or empty
     */
    protected boolean innerAccepts(Object o) {
        if (!(o instanceof CandidateURI)) {
            return false;
        }
        String path = ((CandidateURI) o).getPathFromSeed();
        if (path == null) {
            // No path means no hops to count, which -- exactly like an empty
            // path -- can never be accepted. Reject instead of failing with a
            // NullPointerException below.
            return false;
        }

        int transCount = 0;
        int specCount = 0;
        int refCount = 0;
        int embedCount = 0;

        // Walk the path backwards, from the most recent hop towards the seed.
        scan:
        for (int i = path.length() - 1; i >= 0; i--) {
            switch (path.charAt(i)) {
                case Link.NAVLINK_HOP: {
                    // A navigational link ends the trailing run.
                    break scan;
                }
                case Link.PREREQ_HOP: {
                    if (transCount == 0) {
                        // A prerequisite as the very last hop always counts
                        // as a single hop; earlier hops are disregarded.
                        transCount++;
                        break scan;
                    }
                    break;
                }
                case Link.SPECULATIVE_HOP: {
                    specCount++;
                    break;
                }
                case Link.REFER_HOP: {
                    refCount++;
                    break;
                }
                case Link.EMBED_HOP: {
                    embedCount++;
                    break;
                }
            }
            // Every hop that did not terminate the scan adds to the total.
            transCount++;
        }

        // Refresh the limits in the context of this particular URI.
        readMaxValues(o);

        return (transCount > 0)
            && (transCount <= this.maxTransHops)
            && (this.maxSpeculativeHops < 0
                || specCount <= this.maxSpeculativeHops)
            && (this.maxReferralHops < 0
                || refCount <= this.maxReferralHops)
            && (this.maxEmbedHops < 0
                || embedCount <= this.maxEmbedHops);
    }

    /**
     * Re-reads the four hop limits into this instance's fields.
     *
     * <p>The overall limit is taken from the {@link CrawlScope} module's
     * {@link ClassicScope#ATTR_MAX_TRANS_HOPS} attribute; the other three are
     * this filter's own attributes. If an attribute cannot be found a warning
     * is logged and the method returns: fields assigned before the failure
     * keep their new values, the remaining ones keep their previous values.
     *
     * @param o context object handed to the attribute lookups
     */
    public void readMaxValues(Object o) {
        try {
            CrawlScope scope =
                (CrawlScope) globalSettings().getModule(CrawlScope.ATTR_NAME);
            this.maxTransHops = ((Integer) scope.getAttribute(
                o, ClassicScope.ATTR_MAX_TRANS_HOPS)).intValue();
            this.maxSpeculativeHops = ((Integer) getAttribute(
                o, ATTR_MAX_SPECULATIVE_HOPS)).intValue();
            this.maxReferralHops = ((Integer) getAttribute(
                o, ATTR_MAX_REFERRAL_HOPS)).intValue();
            this.maxEmbedHops = ((Integer) getAttribute(
                o, ATTR_MAX_EMBED_HOPS)).intValue();
        } catch (AttributeNotFoundException e) {
            // Best effort, as before: keep going with whatever limits are
            // currently set, but report through the logging framework
            // rather than dumping a stack trace to stderr.
            LOGGER.log(Level.WARNING,
                "Unable to read hop limits; continuing with current values",
                e);
        }
    }
}