KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > archive > crawler > postprocessor > SupplementaryLinksScoper


/* SupplementaryLinksScoper
 *
 * $Id: SupplementaryLinksScoper.java,v 1.3.14.1 2007/01/13 01:31:24 stack-sf Exp $
 *
 * Created on Oct 2, 2003
 *
 * Copyright (C) 2003 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 *
 */

package org.archive.crawler.postprocessor;

import java.util.Collection;
import java.util.HashSet;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.archive.crawler.datamodel.CandidateURI;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.framework.Filter;
import org.archive.crawler.framework.Scoper;
import org.archive.crawler.settings.MapType;

39 /**
40  * Run CandidateURI links carried in the passed CrawlURI through a filter
41  * and 'handle' rejections.
42  * Used to do supplementary processing of links after they've been scope
43  * processed and ruled 'in-scope' by LinkScoper. An example of
44  * 'supplementary processing' would check that a Link is intended for
45  * this host to crawl in a multimachine crawl setting. Configure filters to
46  * rule on links. Default handler writes rejected URLs to disk. Subclass
47  * to handle rejected URLs otherwise.
48  * @author stack
49  */

50 public class SupplementaryLinksScoper extends Scoper {
51
52     private static final long serialVersionUID = -775819977752790418L;
53
54     private static Logger JavaDoc LOGGER =
55         Logger.getLogger(SupplementaryLinksScoper.class.getName());
56     
57     public static final String JavaDoc ATTR_LINK_FILTERS = "link-filters";
58     
59     /**
60      * Instance of filters to run.
61      */

62     private MapType filters = null;
63     
64     
65     /**
66      * @param name Name of this filter.
67      */

68     public SupplementaryLinksScoper(String JavaDoc name) {
69         super(name, "SupplementaryLinksScoper. Use to do supplementary " +
70             "processing of in-scope links. Will run each link through " +
71             "configured filters. Must be run after LinkScoper and " +
72             "before FrontierScheduler. " +
73             "Optionally logs rejected links (Enable " +
74             ATTR_OVERRIDE_LOGGER_ENABLED + " and set logger level " +
75             "at INFO or above).");
76         
77         this.filters = (MapType)addElementToDefinition(
78             new MapType(ATTR_LINK_FILTERS, "Filters to apply to each " +
79             "link carried by the passed CrawlURI.", Filter.class));
80         this.filters.setExpertSetting(true);
81     }
82
83     protected void innerProcess(final CrawlURI curi) {
84         // If prerequisites or no links, nothing to be done in here.
85
if (curi.hasPrerequisiteUri() || curi.outlinksSize() <= 0) {
86             return;
87         }
88         
89         Collection JavaDoc<CandidateURI> inScopeLinks = new HashSet JavaDoc<CandidateURI>();
90         for (CandidateURI cauri: curi.getOutCandidates()) {
91             if (isInScope(cauri)) {
92                 inScopeLinks.add(cauri);
93             }
94         }
95         // Replace current links collection w/ inscopeLinks. May be
96
// an empty collection.
97
curi.replaceOutlinks(inScopeLinks);
98     }
99     
100     protected boolean isInScope(CandidateURI caUri) {
101         // TODO: Fix filters so work on CandidateURI.
102
CrawlURI curi = (caUri instanceof CrawlURI)?
103             (CrawlURI)caUri:
104             new CrawlURI(caUri.getUURI());
105         boolean result = false;
106         if (filtersAccept(this.filters, curi)) {
107             result = true;
108             if (LOGGER.isLoggable(Level.FINER)) {
109                 LOGGER.finer("Accepted: " + caUri);
110             }
111         } else {
112             outOfScope(caUri);
113         }
114         return result;
115     }
116     
117     /**
118      * Called when a CandidateUri is ruled out of scope.
119      * @param caUri CandidateURI that is out of scope.
120      */

121     protected void outOfScope(CandidateURI caUri) {
122         if (!LOGGER.isLoggable(Level.INFO)) {
123             return;
124         }
125         LOGGER.info(caUri.getUURI().toString());
126     }
127 }
Popular Tags