KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > archive > crawler > scope > SurtPrefixScope


1 /* Copyright (C) 2003 Internet Archive.
2  *
3  * This file is part of the Heritrix web crawler (crawler.archive.org).
4  *
5  * Heritrix is free software; you can redistribute it and/or modify
6  * it under the terms of the GNU Lesser Public License as published by
7  * the Free Software Foundation; either version 2.1 of the License, or
8  * any later version.
9  *
10  * Heritrix is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  * GNU Lesser Public License for more details.
14  *
15  * You should have received a copy of the GNU Lesser Public License
16  * along with Heritrix; if not, write to the Free Software
17  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18  *
19  * SurtPrefixScope.java
20  * Created on Oct 1, 2003
21  *
22  * $Header: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/java/org/archive/crawler/scope/SurtPrefixScope.java,v 1.18.2.1 2007/01/13 01:31:26 stack-sf Exp $
23  */

24 package org.archive.crawler.scope;
25
26 import java.io.File JavaDoc;
27 import java.io.FileReader JavaDoc;
28 import java.io.FileWriter JavaDoc;
29 import java.io.IOException JavaDoc;
30
31 import org.archive.crawler.datamodel.CandidateURI;
32 import org.archive.crawler.deciderules.DecidingScope;
33 import org.archive.crawler.framework.CrawlController;
34 import org.archive.crawler.settings.SimpleType;
35 import org.archive.crawler.settings.Type;
36 import org.archive.util.SurtPrefixSet;
37
38 /**
39  * A specialized CrawlScope suitable for the most common crawl needs.
40  *
41  * Roughly, as with other existing CrawlScope variants, SurtPrefixScope's logic
42  * is that a URI is included if:
43  * <pre>
44  * ( isSeed(uri) || focusFilter.accepts(uri) ) ||
45  * transitiveFilter.accepts(uri) ) && ! excludeFilter.accepts(uri)
46  * </pre>
47  * Specifically, SurtPrefixScope uses a SurtFilter to test for focus-inclusion.
48  *
49  * @author gojomo
50  * @deprecated As of release 1.10.0. Replaced by {@link DecidingScope}.
51  */

52 public class SurtPrefixScope extends RefinedScope {
53
54     private static final long serialVersionUID = 2652008287322770123L;
55
56     public static final String JavaDoc ATTR_SURTS_SOURCE_FILE = "surts-source-file";
57     public static final String JavaDoc ATTR_SEEDS_AS_SURT_PREFIXES = "seeds-as-surt-prefixes";
58     public static final String JavaDoc ATTR_SURTS_DUMP_FILE = "surts-dump-file";
59     
60     private static final Boolean JavaDoc DEFAULT_SEEDS_AS_SURT_PREFIXES = new Boolean JavaDoc(true);
61
62     /**
63      * Whether the 'via' of CrawlURIs should also be checked
64      * to see if it is prefixed by the set of SURT prefixes
65      */

66     public static final String JavaDoc
67         ATTR_ALSO_CHECK_VIA = "also-check-via";
68     public static final Boolean JavaDoc
69         DEFAULT_ALSO_CHECK_VIA = Boolean.FALSE;
70     
71     SurtPrefixSet surtPrefixes = null;
72
73     public SurtPrefixScope(String JavaDoc name) {
74         super(name);
75         setDescription(
76                 "SurtPrefixScope: A scope for crawls limited to regions of " +
77                 "the web defined by a set of SURT prefixes *Deprecated* " +
78                 "Use DecidingScope instead. (The SURT form of " +
79                 "a URI has its hostname reordered to ease sorting and "
80                 + "grouping by domain hierarchies.)");
81         addElementToDefinition(
82                 new SimpleType(ATTR_SURTS_SOURCE_FILE,
83                         "Source file from which to infer SURT prefixes. Any URLs " +
84                         "in file will be converted to the implied SURT prefix, and " +
85                         "literal SURT prefixes may be listed on lines beginning " +
86                         "with a '+' character.",
87                         ""));
88         addElementToDefinition(
89                 new SimpleType(ATTR_SEEDS_AS_SURT_PREFIXES,
90                         "Should seeds also be interpreted as SURT prefixes.",
91                         DEFAULT_SEEDS_AS_SURT_PREFIXES));
92         
93         Type t = addElementToDefinition(
94                 new SimpleType(ATTR_SURTS_DUMP_FILE,
95                         "Dump file to save SURT prefixes actually used.",
96                         ""));
97         t.setExpertSetting(true);
98         t = addElementToDefinition(new SimpleType(ATTR_ALSO_CHECK_VIA,
99                 "Whether to also rule URI in-scope if a " +
100                 "URI's 'via' URI (the URI from which it was discovered) " +
101                 "in SURT form begins with any of the established prefixes. " +
102                 "For example, can be used to accept URIs that are 'one hop " +
103                 "off' URIs fitting the SURT prefixes. Default is false.",
104                 DEFAULT_ALSO_CHECK_VIA));
105         t.setOverrideable(false);
106         t.setExpertSetting(true);
107
108     }
109
110     
111     /* (non-Javadoc)
112      * @see org.archive.crawler.framework.CrawlScope#initialize(org.archive.crawler.framework.CrawlController)
113      */

114     public void initialize(CrawlController controller) {
115         super.initialize(controller);
116         readPrefixes();
117     }
118     
119     /**
120      * Check if a URI is part of this scope.
121      *
122      * @param object
123      * An instance of UURI or of CandidateURI.
124      * @return True if focus filter accepts passed object.
125      */

126     protected synchronized boolean focusAccepts(Object JavaDoc object) {
127         // TODO: eliminate duplication wrt/SurtPrefixedDecideRule.evaluate
128
if (surtPrefixes == null) {
129             readPrefixes();
130         }
131         if ( (object instanceof CandidateURI) &&
132                 ((Boolean JavaDoc) getUncheckedAttribute(null, ATTR_ALSO_CHECK_VIA))
133                     .booleanValue()) {
134             if(focusAccepts(((CandidateURI)object).getVia())) {
135                 return true;
136             }
137         }
138         String JavaDoc candidateSurt = SurtPrefixSet.getCandidateSurt(object);
139         if(candidateSurt == null) {
140             return false;
141         }
142         return surtPrefixes.containsPrefixOf(candidateSurt);
143     }
144     
145     private void readPrefixes() {
146         surtPrefixes = new SurtPrefixSet();
147         FileReader JavaDoc fr = null;
148         
149         // read SURTs from file, if appropriate
150
String JavaDoc sourcePath = (String JavaDoc) getUncheckedAttribute(null,
151                 ATTR_SURTS_SOURCE_FILE);
152         if(sourcePath.length()>0) {
153             File JavaDoc source = new File JavaDoc(sourcePath);
154             if (!source.isAbsolute()) {
155                 source = new File JavaDoc(getSettingsHandler().getOrder()
156                         .getController().getDisk(), sourcePath);
157             }
158             try {
159                 fr = new FileReader JavaDoc(source);
160                 try {
161                     surtPrefixes.importFromMixed(fr,true);
162                 } finally {
163                     fr.close();
164                 }
165         
166             } catch (IOException JavaDoc e) {
167                 e.printStackTrace();
168                 throw new RuntimeException JavaDoc(e);
169             }
170         }
171         
172         // interpret seeds as surts, if appropriate
173
boolean deduceFromSeeds =
174             ((Boolean JavaDoc) getUncheckedAttribute(null, ATTR_SEEDS_AS_SURT_PREFIXES))
175             .booleanValue();
176         try {
177             fr = new FileReader JavaDoc(getSeedfile());
178             try {
179                 surtPrefixes.importFromMixed(fr,deduceFromSeeds);
180             } finally {
181                 fr.close();
182             }
183         } catch (IOException JavaDoc e) {
184             e.printStackTrace();
185             throw new RuntimeException JavaDoc(e);
186         }
187
188         // dump surts to file, if appropriate
189
String JavaDoc dumpPath = (String JavaDoc) getUncheckedAttribute(null,
190                 ATTR_SURTS_DUMP_FILE);
191         if(dumpPath.length()>0) {
192             File JavaDoc dump = new File JavaDoc(dumpPath);
193             if (!dump.isAbsolute()) {
194                 dump = new File JavaDoc(getSettingsHandler().getOrder()
195                         .getController().getDisk(), dumpPath);
196             }
197             try {
198                 FileWriter JavaDoc fw = new FileWriter JavaDoc(dump);
199                 try {
200                     surtPrefixes.exportTo(fw);
201                 } finally {
202                     fw.close();
203                 }
204             } catch (IOException JavaDoc e) {
205                 e.printStackTrace();
206                 throw new RuntimeException JavaDoc(e);
207             }
208         }
209     }
210
211     /**
212      * Re-read prefixes after an update.
213      *
214      * @see org.archive.crawler.framework.CrawlScope#kickUpdate()
215      */

216     public synchronized void kickUpdate() {
217         super.kickUpdate();
218         // TODO: make conditional on file having actually changed,
219
// perhaps by remembering mod-time
220
readPrefixes();
221     }
222 }
223
Popular Tags