KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > archive > crawler > framework > CrawlScope


1 /* Copyright (C) 2003 Internet Archive.
2  *
3  * This file is part of the Heritrix web crawler (crawler.archive.org).
4  *
5  * Heritrix is free software; you can redistribute it and/or modify
6  * it under the terms of the GNU Lesser Public License as published by
7  * the Free Software Foundation; either version 2.1 of the License, or
8  * any later version.
9  *
10  * Heritrix is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  * GNU Lesser Public License for more details.
14  *
15  * You should have received a copy of the GNU Lesser Public License
16  * along with Heritrix; if not, write to the Free Software
17  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18  *
19  * CrawlScope.java
20  * Created on Oct 1, 2003
21  *
22  * $Header: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/java/org/archive/crawler/framework/CrawlScope.java,v 1.49.14.1 2007/01/13 01:31:21 stack-sf Exp $
23  */

24 package org.archive.crawler.framework;
25
26 import java.io.BufferedReader JavaDoc;
27 import java.io.File JavaDoc;
28 import java.io.FileReader JavaDoc;
29 import java.io.FileWriter JavaDoc;
30 import java.io.IOException JavaDoc;
31 import java.io.Writer JavaDoc;
32 import java.util.HashSet JavaDoc;
33 import java.util.Iterator JavaDoc;
34 import java.util.List JavaDoc;
35 import java.util.Set JavaDoc;
36 import java.util.logging.Logger JavaDoc;
37
38 import javax.management.AttributeNotFoundException JavaDoc;
39 import javax.management.MBeanException JavaDoc;
40 import javax.management.ReflectionException JavaDoc;
41
42 import org.apache.commons.httpclient.URIException;
43 import org.archive.crawler.datamodel.CandidateURI;
44 import org.archive.crawler.scope.SeedFileIterator;
45 import org.archive.crawler.scope.SeedListener;
46 import org.archive.crawler.settings.CrawlerSettings;
47 import org.archive.crawler.settings.SimpleType;
48 import org.archive.crawler.settings.Type;
49 import org.archive.net.UURI;
50 import org.archive.util.DevUtils;
51
52 /**
53  * A CrawlScope instance defines which URIs are "in"
54  * a particular crawl.
55  *
56  * It is essentially a Filter which determines, looking at
57  * the totality of information available about a
58  * CandidateURI/CrawlURI instance, if that URI should be
59  * scheduled for crawling.
60  *
61  * Dynamic information inherent in the discovery of the
62  * URI -- such as the path by which it was discovered --
63  * may be considered.
64  *
65  * Dynamic information which requires the consultation
66  * of external and potentially volatile information --
67  * such as current robots.txt requests and the history
68  * of attempts to crawl the same URI -- should NOT be
69  * considered. Those potentially high-latency decisions
70  * should be made at another step.
71  *
72  * @author gojomo
73  *
74  */

75 public class CrawlScope extends Filter {
76
77     private static final long serialVersionUID = -3321533224526211277L;
78
79     private static final Logger JavaDoc logger =
80         Logger.getLogger(CrawlScope.class.getName());
81     public static final String JavaDoc ATTR_NAME = "scope";
82     public static final String JavaDoc ATTR_SEEDS = "seedsfile";
83     
84     /**
85      * Whether every configuration change should trigger a
86      * rereading of the original seeds spec/file.
87      */

88     public static final String JavaDoc
89         ATTR_REREAD_SEEDS_ON_CONFIG = "reread-seeds-on-config";
90     public static final Boolean JavaDoc
91         DEFAULT_REREAD_SEEDS_ON_CONFIG = Boolean.TRUE;
92     
93     protected Set JavaDoc<SeedListener> seedListeners = new HashSet JavaDoc<SeedListener>();
94
95     /** Constructs a new CrawlScope.
96      *
97      * @param name the name is ignored since it always have to be the value of
98      * the constant ATT_NAME.
99      */

100     public CrawlScope(String JavaDoc name) {
101         // 'name' is never used.
102
super(ATTR_NAME, "Crawl scope");
103         Type t;
104         t = addElementToDefinition(new SimpleType(ATTR_SEEDS,
105                 "File from which to extract seeds.", "seeds.txt"));
106         t.setOverrideable(false);
107         t.setExpertSetting(true);
108         t = addElementToDefinition(new SimpleType(ATTR_REREAD_SEEDS_ON_CONFIG,
109                 "Whether to reread the seeds specification, whether it has " +
110                 "changed or not, every time any configuration change occurs. " +
111                 "If true, seeds are reread even when (for example) new " +
112                 "domain overrides are set. Rereading the seeds can take a " +
113                 "long time with large seed lists.",
114                 DEFAULT_REREAD_SEEDS_ON_CONFIG));
115         t.setOverrideable(false);
116         t.setExpertSetting(true);
117
118     }
119
120     /** Default constructor.
121      */

122     public CrawlScope() {
123         this(ATTR_NAME);
124     }
125
126     /**
127      * Initialize is called just before the crawler starts to run.
128      *
129      * The settings system is up and initialized so can be used. This
130      * initialize happens after {@link #earlyInitialize(CrawlerSettings)}.
131      *
132      * @param controller Controller object.
133      */

134     public void initialize(CrawlController controller) {
135         // by default do nothing (subclasses override)
136
}
137
138     public String JavaDoc toString() {
139         return "CrawlScope<" + getName() + ">";
140     }
141
142     /**
143      * Refresh seeds.
144      *
145      */

146     public void refreshSeeds() {
147         // by default do nothing (subclasses which cache should override)
148
}
149
150     /**
151      * @return Seed list file or null if problem getting settings file.
152      */

153     public File JavaDoc getSeedfile() {
154         File JavaDoc file = null;
155         try {
156             file = getSettingsHandler().getPathRelativeToWorkingDirectory(
157                 (String JavaDoc)getAttribute(ATTR_SEEDS));
158             if (!file.exists() || !file.canRead()) {
159                 throw new IOException JavaDoc("Seeds file " +
160                     file.getAbsolutePath() + " does not exist or unreadable.");
161             }
162         } catch (IOException JavaDoc e) {
163             DevUtils.warnHandle(e, "problem reading seeds");
164         } catch (AttributeNotFoundException JavaDoc e) {
165             DevUtils.warnHandle(e, "problem reading seeds");
166         } catch (MBeanException JavaDoc e) {
167             DevUtils.warnHandle(e, "problem reading seeds");
168             e.printStackTrace();
169         } catch (ReflectionException JavaDoc e) {
170             DevUtils.warnHandle(e, "problem reading seeds");
171             e.printStackTrace();
172         }
173
174         return file;
175     }
176
177     /** Check if a URI is in the seeds.
178      *
179      * @param o the URI to check.
180      * @return true if URI is a seed.
181      */

182     protected boolean isSeed(Object JavaDoc o) {
183         return o instanceof CandidateURI && ((CandidateURI) o).isSeed();
184     }
185
186     /**
187      * @param a First UURI of compare.
188      * @param b Second UURI of compare.
189      * @return True if UURIs are of same host.
190      */

191     protected boolean isSameHost(UURI a, UURI b) {
192         boolean isSameHost = false;
193         if (a != null && b != null) {
194             // getHost can come back null. See
195
// "[ 910120 ] java.net.URI#getHost fails when leading digit"
196
try {
197                 if (a.getReferencedHost() != null && b.getReferencedHost() != null) {
198                     if (a.getReferencedHost().equals(b.getReferencedHost())) {
199                         isSameHost = true;
200                     }
201                 }
202             }
203             catch (URIException e) {
204                 logger.severe("Failed compare of " + a + " " + b + ": " +
205                     e.getMessage());
206             }
207         }
208         return isSameHost;
209     }
210
211
212
213     /* (non-Javadoc)
214      * @see org.archive.crawler.settings.ModuleType#listUsedFiles(java.util.List)
215      */

216     public void listUsedFiles(List JavaDoc<String JavaDoc> list){
217         // Add seed file
218
try {
219             File JavaDoc file = getSettingsHandler().getPathRelativeToWorkingDirectory(
220                     (String JavaDoc)getAttribute(ATTR_SEEDS));
221             list.add(file.getAbsolutePath());
222         } catch (AttributeNotFoundException JavaDoc e) {
223             // TODO Auto-generated catch block
224
e.printStackTrace();
225         } catch (MBeanException JavaDoc e) {
226             // TODO Auto-generated catch block
227
e.printStackTrace();
228         } catch (ReflectionException JavaDoc e) {
229             // TODO Auto-generated catch block
230
e.printStackTrace();
231         }
232     }
233
234     /**
235      * Take note of a situation (such as settings edit) where
236      * involved reconfiguration (such as reading from external
237      * files) may be necessary.
238      */

239     public void kickUpdate() {
240         // TODO: further improve this so that case with hundreds of
241
// thousands or millions of seeds works better without requiring
242
// this specific settings check
243
if (((Boolean JavaDoc) getUncheckedAttribute(null, ATTR_REREAD_SEEDS_ON_CONFIG))
244                 .booleanValue()) {
245             refreshSeeds();
246             getSettingsHandler().getOrder().getController().getFrontier().loadSeeds();
247         }
248     }
249
250     /**
251      * Gets an iterator over all configured seeds. Subclasses
252      * which cache seeds in memory can override with more
253      * efficient implementation.
254      *
255      * @return Iterator, perhaps over a disk file, of seeds
256      */

257     public Iterator JavaDoc<UURI> seedsIterator() {
258         return seedsIterator(null);
259     }
260     
261     /**
262      * Gets an iterator over all configured seeds. Subclasses
263      * which cache seeds in memory can override with more
264      * efficient implementation.
265      *
266      * @param ignoredItemWriter optional writer to get ignored seed items report
267      * @return Iterator, perhaps over a disk file, of seeds
268      */

269     public Iterator JavaDoc<UURI> seedsIterator(Writer JavaDoc ignoredItemWriter) {
270         BufferedReader JavaDoc br;
271         try {
272             br = new BufferedReader JavaDoc(new FileReader JavaDoc(getSeedfile()));
273         } catch (IOException JavaDoc e) {
274             throw new RuntimeException JavaDoc(e);
275         }
276         return new SeedFileIterator(br,ignoredItemWriter);
277     }
278     
279     /**
280      * Convenience method to close SeedFileIterator, if appropriate.
281      *
282      * @param iter Iterator to check if SeedFileIterator needing closing
283      */

284     protected void checkClose(Iterator JavaDoc iter) {
285         if(iter instanceof SeedFileIterator) {
286             ((SeedFileIterator)iter).close();
287         }
288     }
289     
290     /**
291      * Add a new seed to scope. By default, simply appends
292      * to seeds file, though subclasses may handle differently.
293      *
294      * <p>This method is *not* sufficient to get the new seed
295      * scheduled in the Frontier for crawling -- it only
296      * affects the Scope's seed record (and decisions which
297      * flow from seeds).
298      *
299      * @param curi CandidateUri to add
300      * @return true if successful, false if add failed for any reason
301      */

302     public boolean addSeed(final CandidateURI curi) {
303         File JavaDoc f = getSeedfile();
304         if (f != null) {
305             try {
306                 FileWriter JavaDoc fw = new FileWriter JavaDoc(f, true);
307                 // Write to new (last) line the URL.
308
fw.write("\n");
309                 fw.write("# Heritrix added seed " +
310                     ((curi.getVia() != null) ? "redirect from " + curi.getVia():
311                         "(JMX)") + ".\n");
312                 fw.write(curi.toString());
313                 fw.flush();
314                 fw.close();
315                 Iterator JavaDoc iter = seedListeners.iterator();
316                 while(iter.hasNext()) {
317                     ((SeedListener)iter.next()).addedSeed(curi);
318                 }
319                 return true;
320             } catch (IOException JavaDoc e) {
321                 DevUtils.warnHandle(e, "problem writing new seed");
322             }
323         }
324         return false;
325     }
326     
327     public void addSeedListener(SeedListener sl) {
328         seedListeners.add(sl);
329     }
330 }
331
Popular Tags