KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > archive > crawler > deciderules > SurtPrefixedDecideRule


1 /* SurtPrefixedDecideRule
2 *
3 * $Id: SurtPrefixedDecideRule.java,v 1.16.6.1 2007/01/13 01:31:15 stack-sf Exp $
4 *
5 * Created on Apr 5, 2005
6 *
7 * Copyright (C) 2005 Internet Archive.
8 *
9 * This file is part of the Heritrix web crawler (crawler.archive.org).
10 *
11 * Heritrix is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU Lesser Public License as published by
13 * the Free Software Foundation; either version 2.1 of the License, or
14 * any later version.
15 *
16 * Heritrix is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU Lesser Public License for more details.
20 *
21 * You should have received a copy of the GNU Lesser Public License
22 * along with Heritrix; if not, write to the Free Software
23 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
24 */

25 package org.archive.crawler.deciderules;
26
27 import java.io.File JavaDoc;
28 import java.io.FileReader JavaDoc;
29 import java.io.FileWriter JavaDoc;
30 import java.io.IOException JavaDoc;
31
32 import org.archive.crawler.datamodel.CandidateURI;
33 import org.archive.crawler.framework.CrawlScope;
34 import org.archive.crawler.scope.SeedListener;
35 import org.archive.crawler.settings.SimpleType;
36 import org.archive.crawler.settings.Type;
37 import org.archive.util.SurtPrefixSet;
38
39
40
41 /**
42  * Rule applies configured decision to any URIs that, when
43  * expressed in SURT form, begin with one of the prefixes
44  * in the configured set.
45  *
46  * The set can be filled with SURT prefixes implied or
47  * listed in the seeds file, or another external file.
48  *
49  * The "also-check-via" option to implement "one hop off"
50  * scoping derives from a contribution by Shifra Raffel
51  * of the California Digital Library.
52  *
53  * @author gojomo
54  */

55 public class SurtPrefixedDecideRule extends PredicatedDecideRule
56         implements SeedListener {
57
58     private static final long serialVersionUID = 2075790126085405015L;
59
60     //private static final Logger logger =
61
// Logger.getLogger(SurtPrefixedDecideRule.class.getName());
62

63     public static final String JavaDoc ATTR_SURTS_SOURCE_FILE = "surts-source-file";
64     public static final String JavaDoc ATTR_SEEDS_AS_SURT_PREFIXES =
65         "seeds-as-surt-prefixes";
66     public static final String JavaDoc ATTR_SURTS_DUMP_FILE = "surts-dump-file";
67     
68     private static final Boolean JavaDoc DEFAULT_SEEDS_AS_SURT_PREFIXES =
69         new Boolean JavaDoc(true);
70
71     /**
72      * Whether every config change should trigger a
73      * rebuilding of the prefix set.
74      */

75     public static final String JavaDoc
76         ATTR_REBUILD_ON_RECONFIG = "rebuild-on-reconfig";
77     public static final Boolean JavaDoc
78         DEFAULT_REBUILD_ON_RECONFIG = Boolean.TRUE;
79     
80     /**
81      * Whether the 'via' of CrawlURIs should also be checked
82      * to see if it is prefixed by the set of SURT prefixes
83      */

84     public static final String JavaDoc
85         ATTR_ALSO_CHECK_VIA = "also-check-via";
86     public static final Boolean JavaDoc
87         DEFAULT_ALSO_CHECK_VIA = Boolean.FALSE;
88     
89     protected SurtPrefixSet surtPrefixes = null;
90
91     /**
92      * Usual constructor.
93      * @param name
94      */

95     public SurtPrefixedDecideRule(String JavaDoc name) {
96         super(name);
97         setDescription("SurtPrefixedDecideRule. Makes the configured decision "
98                 + "for any URI which, when expressed in SURT form, begins "
99                 + "with any of the established prefixes (from either seeds "
100                 + "specification or an external file).");
101         addElementToDefinition(new SimpleType(ATTR_SURTS_SOURCE_FILE,
102                 "Source file from which to infer SURT prefixes. Any URLs " +
103                 "in file will be converted to the implied SURT prefix, and " +
104                 "literal SURT prefixes may be listed on lines beginning " +
105                 "with a '+' character.",
106                 ""));
107         addElementToDefinition(new SimpleType(ATTR_SEEDS_AS_SURT_PREFIXES,
108                 "Should seeds also be interpreted as SURT prefixes.",
109                 DEFAULT_SEEDS_AS_SURT_PREFIXES));
110         Type t = addElementToDefinition(new SimpleType(ATTR_SURTS_DUMP_FILE,
111                 "Dump file to save SURT prefixes actually used: " +
112                 "Useful debugging SURTs.", ""));
113         t.setExpertSetting(true);
114         t = addElementToDefinition(new SimpleType(ATTR_ALSO_CHECK_VIA,
115                 "Whether to also make the configured decision if a " +
116                 "URI's 'via' URI (the URI from which it was discovered) " +
117                 "in SURT form begins with any of the established prefixes. " +
118                 "For example, can be used to ACCEPT URIs that are 'one hop " +
119                 "off' URIs fitting the SURT prefixes. Default is false.",
120                 DEFAULT_ALSO_CHECK_VIA));
121         t.setOverrideable(false);
122         t.setExpertSetting(true);
123         t = addElementToDefinition(new SimpleType(ATTR_REBUILD_ON_RECONFIG,
124                 "Whether to rebuild the internal structures from source " +
125                 "files (including seeds if appropriate) every time any " +
126                 "configuration change occurs. If true, " +
127                 "rule is rebuilt from sources even when (for example) " +
128                 "unrelated new domain overrides are set. Rereading large" +
129                 "source files can take a long time.",
130                 DEFAULT_REBUILD_ON_RECONFIG));
131         t.setOverrideable(false);
132         t.setExpertSetting(true);
133     }
134
135     /**
136      * Evaluate whether given object's URI is covered by the SURT prefix set
137      *
138      * @param object Item to evaluate.
139      * @return true if item, as SURT form URI, is prefixed by an item in the set
140      */

141     protected boolean evaluate(Object JavaDoc object) {
142         if ( (object instanceof CandidateURI) &&
143                 ((Boolean JavaDoc) getUncheckedAttribute(null, ATTR_ALSO_CHECK_VIA))
144                     .booleanValue()) {
145             if(evaluate(((CandidateURI)object).getVia())) {
146                 return true;
147             }
148         }
149         String JavaDoc candidateSurt;
150         candidateSurt = SurtPrefixSet.getCandidateSurt(object);
151         if (candidateSurt == null) {
152             return false;
153         }
154         return getPrefixes().containsPrefixOf(candidateSurt);
155     }
156
157     /**
158      * Synchronized get of prefix set to use
159      *
160      * @return SurtPrefixSet to use for check
161      */

162     private synchronized SurtPrefixSet getPrefixes() {
163         if (surtPrefixes == null) {
164             readPrefixes();
165         }
166         return surtPrefixes;
167     }
168
169     protected void readPrefixes() {
170         buildSurtPrefixSet();
171         dumpSurtPrefixSet();
172     }
173     
174     /**
175      * Dump the current prefixes in use to configured dump file (if any)
176      */

177     protected void dumpSurtPrefixSet() {
178         // dump surts to file, if appropriate
179
String JavaDoc dumpPath = (String JavaDoc)getUncheckedAttribute(null,
180             ATTR_SURTS_DUMP_FILE);
181         if (dumpPath.length() > 0) {
182             File JavaDoc dump = new File JavaDoc(dumpPath);
183             if (!dump.isAbsolute()) {
184                 dump = new File JavaDoc(getSettingsHandler().getOrder().getController()
185                     .getDisk(), dumpPath);
186             }
187             try {
188                 FileWriter JavaDoc fw = new FileWriter JavaDoc(dump);
189                 try {
190                     surtPrefixes.exportTo(fw);
191                 } finally {
192                     fw.close();
193                 }
194             } catch (IOException JavaDoc e) {
195                 e.printStackTrace();
196                 throw new RuntimeException JavaDoc(e);
197             }
198         }
199     }
200
201     /**
202      * Construct the set of prefixes to use, from the seed list (
203      * which may include both URIs and '+'-prefixed directives).
204      */

205     protected void buildSurtPrefixSet() {
206         SurtPrefixSet newSurtPrefixes = new SurtPrefixSet();
207         FileReader JavaDoc fr = null;
208
209         // read SURTs from file, if appropriate
210
String JavaDoc sourcePath = (String JavaDoc)getUncheckedAttribute(null,
211                 ATTR_SURTS_SOURCE_FILE);
212         if (sourcePath.length() > 0) {
213             File JavaDoc source = new File JavaDoc(sourcePath);
214             if (!source.isAbsolute()) {
215                 source = new File JavaDoc(getSettingsHandler().getOrder()
216                     .getController().getDisk(), sourcePath);
217             }
218             try {
219                 fr = new FileReader JavaDoc(source);
220                 try {
221                     newSurtPrefixes.importFromMixed(fr, true);
222                 } finally {
223                     fr.close();
224                 }
225             } catch (IOException JavaDoc e) {
226                 e.printStackTrace();
227                 throw new RuntimeException JavaDoc(e);
228             }
229         }
230         
231         // interpret seeds as surts, if appropriate
232
boolean deduceFromSeeds = ((Boolean JavaDoc)getUncheckedAttribute(null,
233                 ATTR_SEEDS_AS_SURT_PREFIXES)).booleanValue();
234         if(deduceFromSeeds) {
235             try {
236                 fr = new FileReader JavaDoc(getSeedfile());
237                 try {
238                     newSurtPrefixes.importFromMixed(fr, deduceFromSeeds);
239                 } finally {
240                     fr.close();
241                 }
242             } catch (IOException JavaDoc e) {
243                 e.printStackTrace();
244                 throw new RuntimeException JavaDoc(e);
245             }
246         }
247
248         surtPrefixes = newSurtPrefixes;
249     }
250
251     /**
252      * Re-read prefixes after an update.
253      *
254      * @see org.archive.crawler.framework.CrawlScope#kickUpdate()
255      */

256     public synchronized void kickUpdate() {
257         super.kickUpdate();
258         if (((Boolean JavaDoc) getUncheckedAttribute(null, ATTR_REBUILD_ON_RECONFIG))
259                 .booleanValue()) {
260             readPrefixes();
261         }
262         // TODO: make conditional on file having actually changed,
263
// perhaps by remembering mod-time
264
}
265
266     /**
267      * Dig through everything to get the crawl-global seeds file.
268      * Add self as listener while at it.
269      *
270      * @return Seed list file
271      */

272     protected File JavaDoc getSeedfile() {
273         CrawlScope scope =
274             getSettingsHandler().getOrder().getController().getScope();
275         scope.addSeedListener(this);
276         return scope.getSeedfile();
277     }
278
279     public synchronized void addedSeed(final CandidateURI curi) {
280         SurtPrefixSet newSurtPrefixes = (SurtPrefixSet) surtPrefixes.clone();
281         newSurtPrefixes.add(prefixFrom(curi.toString()));
282         surtPrefixes = newSurtPrefixes;
283     }
284     
285     protected String JavaDoc prefixFrom(String JavaDoc uri) {
286         return SurtPrefixSet.prefixFromPlain(uri);
287     }
288 }
289
Popular Tags