KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > archive > crawler > postprocessor > LinksScoper


1 /* LinksScoper
2  *
3  * $Id: LinksScoper.java,v 1.8.4.1 2007/01/13 01:31:24 stack-sf Exp $
4  *
5  * Created on Oct 2, 2003
6  *
7  * Copyright (C) 2003 Internet Archive.
8  *
9  * This file is part of the Heritrix web crawler (crawler.archive.org).
10  *
11  * Heritrix is free software; you can redistribute it and/or modify
12  * it under the terms of the GNU Lesser Public License as published by
13  * the Free Software Foundation; either version 2.1 of the License, or
14  * any later version.
15  *
16  * Heritrix is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19  * GNU Lesser Public License for more details.
20  *
21  * You should have received a copy of the GNU Lesser Public License
22  * along with Heritrix; if not, write to the Free Software
23  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
24  *
25  */

26 package org.archive.crawler.postprocessor;
27
28 import java.util.Collection JavaDoc;
29 import java.util.HashSet JavaDoc;
30 import java.util.Iterator JavaDoc;
31 import java.util.logging.Level JavaDoc;
32 import java.util.logging.Logger JavaDoc;
33
34 import org.apache.commons.httpclient.URIException;
35 import org.archive.crawler.datamodel.CandidateURI;
36 import org.archive.crawler.datamodel.CrawlURI;
37 import org.archive.crawler.datamodel.FetchStatusCodes;
38 import org.archive.crawler.extractor.Link;
39 import org.archive.crawler.framework.Filter;
40 import org.archive.crawler.framework.Scoper;
41 import org.archive.crawler.settings.MapType;
42 import org.archive.crawler.settings.SimpleType;
43 import org.archive.crawler.settings.Type;
44
45 /**
46  * Determine which extracted links are within scope.
47  * TODO: To test scope, requires that Link be converted to
48  * a CandidateURI. Make it so don't have to make a CandidateURI to test
49  * if Link is in scope.
50  * <p>Since this scoper has to create CandidateURIs, no sense
51  * discarding them since later in the processing chain CandidateURIs rather
52  * than Links are whats needed scheduling extracted links w/ the
53  * Frontier (Frontier#schedule expects CandidateURI, not Link). This class
54  * replaces Links w/ the CandidateURI that wraps the Link in the CrawlURI.
55  *
56  * @author gojomo
57  * @author stack
58  */

59 public class LinksScoper extends Scoper
60 implements FetchStatusCodes {
61
62     private static final long serialVersionUID = -4074442117992496793L;
63
64     private static Logger JavaDoc LOGGER =
65         Logger.getLogger(LinksScoper.class.getName());
66
67     private final static String JavaDoc ATTR_SEED_REDIRECTS_NEW_SEEDS =
68         "seed-redirects-new-seed";
69     
70     private final static Boolean JavaDoc DEFAULT_SEED_REDIRECTS_NEW_SEEDS =
71         new Boolean JavaDoc(true);
72     
73     public static final String JavaDoc ATTR_LOG_REJECT_FILTERS =
74         "scope-rejected-url-filters";
75     
76     public static final String JavaDoc ATTR_PREFERENCE_DEPTH_HOPS =
77         "preference-depth-hops";
78
79     private final static Integer JavaDoc DEFAULT_PREFERENCE_DEPTH_HOPS =
80         new Integer JavaDoc(-1);
81     
82     /**
83      * Instance of rejected uris log filters.
84      */

85     private MapType rejectLogFilters = null;
86     
87     /**
88      * @param name Name of this filter.
89      */

90     public LinksScoper(String JavaDoc name) {
91         super(name, "LinksScoper. Rules on which extracted links " +
92             "are within configured scope.");
93         
94         Type t;
95         t = addElementToDefinition(
96             new SimpleType(ATTR_SEED_REDIRECTS_NEW_SEEDS,
97             "If enabled, any URL found because a seed redirected to it " +
98             "(original seed returned 301 or 302), will also be treated " +
99             "as a seed.", DEFAULT_SEED_REDIRECTS_NEW_SEEDS));
100         t.setExpertSetting(true);
101
102         t = addElementToDefinition(new SimpleType(ATTR_PREFERENCE_DEPTH_HOPS,
103             "Number of hops (of any sort) from a seed up to which a URI has higher " +
104         "priority scheduling than any remaining seed. For example, if set to 1 items one " +
105         "hop (link, embed, redirect, etc.) away from a seed will be scheduled " +
106         "with HIGH priority. If set to -1, no " +
107         "preferencing will occur, and a breadth-first search with seeds " +
108         "processed before discovered links will proceed. If set to zero, a " +
109         "purely depth-first search will proceed, with all discovered links processed " +
110         "before remaining seeds. Seed redirects are treated as one hop from a seed.",
111         DEFAULT_PREFERENCE_DEPTH_HOPS));
112         t.setExpertSetting(true);
113         
114         this.rejectLogFilters = (MapType)addElementToDefinition(
115             new MapType(ATTR_LOG_REJECT_FILTERS, "Filters applied after " +
116                "an URI has been rejected. If filter return " +
117                "TRUE, the URI is logged (if the logging level is INFO). " +
118                "Depends on " + ATTR_OVERRIDE_LOGGER_ENABLED +
119                " being enabled.", Filter.class));
120         this.rejectLogFilters.setExpertSetting(true);
121     }
122
123     protected void innerProcess(final CrawlURI curi) {
124         if (LOGGER.isLoggable(Level.FINEST)) {
125             LOGGER.finest(getName() + " processing " + curi);
126         }
127         
128         // If prerequisites, nothing to be done in here.
129
if (curi.hasPrerequisiteUri()) {
130             handlePrerequisite(curi);
131             return;
132         }
133         
134         // Don't extract links of error pages.
135
if (curi.getFetchStatus() < 200 || curi.getFetchStatus() >= 400) {
136             curi.clearOutlinks();
137             return;
138         }
139         
140         if (curi.outlinksSize() <= 0) {
141             // No outlinks to process.
142
return;
143         }
144
145         final boolean redirectsNewSeeds = ((Boolean JavaDoc)getUncheckedAttribute(curi,
146             ATTR_SEED_REDIRECTS_NEW_SEEDS)).booleanValue();
147         int preferenceDepthHops = ((Integer JavaDoc)getUncheckedAttribute(curi,
148             ATTR_PREFERENCE_DEPTH_HOPS)).intValue();
149         Collection JavaDoc<CandidateURI> inScopeLinks = new HashSet JavaDoc<CandidateURI>();
150         for (final Iterator JavaDoc i = curi.getOutObjects().iterator(); i.hasNext();) {
151             Object JavaDoc o = i.next();
152             if(o instanceof Link){
153                 final Link wref = (Link)o;
154                 try {
155                     final int directive = getSchedulingFor(curi, wref,
156                         preferenceDepthHops);
157                     final CandidateURI caURI =
158                         curi.createCandidateURI(curi.getBaseURI(), wref,
159                             directive,
160                             considerAsSeed(curi, wref, redirectsNewSeeds));
161                     if (isInScope(caURI)) {
162                         inScopeLinks.add(caURI);
163                     }
164                 } catch (URIException e) {
165                     getController().logUriError(e, curi.getUURI(),
166                         wref.getDestination().toString());
167                 }
168             } else if(o instanceof CandidateURI){
169                 CandidateURI caURI = (CandidateURI)o;
170                 if(isInScope(caURI)){
171                     inScopeLinks.add(caURI);
172                 }
173             } else {
174                 LOGGER.severe("Unexpected type: " + o);
175             }
176         }
177         // Replace current links collection w/ inscopeLinks. May be
178
// an empty collection.
179
curi.replaceOutlinks(inScopeLinks);
180     }
181     
182     /**
183      * The CrawlURI has a prerequisite; apply scoping and update
184      * Link to CandidateURI in manner analogous to outlink handling.
185      * @param curi CrawlURI with prereq to consider
186      */

187     protected void handlePrerequisite(CrawlURI curi) {
188         try {
189             // Create prerequisite CandidateURI
190
CandidateURI caUri =
191                 curi.createCandidateURI(curi.getBaseURI(),
192                     (Link) curi.getPrerequisiteUri());
193             int prereqPriority = curi.getSchedulingDirective() - 1;
194             if (prereqPriority < 0) {
195                 prereqPriority = 0;
196                 LOGGER.severe("Unable to promote prerequisite " + caUri +
197                     " above " + curi);
198             }
199             caUri.setSchedulingDirective(prereqPriority);
200             caUri.setForceFetch(true);
201             if(isInScope(caUri)) {
202                 // replace link with CandidateURI
203
curi.setPrerequisiteUri(caUri);
204             } else {
205                 // prerequisite is out-of-scope; mark CrawlURI as error,
206
// preventinting normal S_DEFERRED handling
207
curi.setFetchStatus(S_PREREQUISITE_UNSCHEDULABLE_FAILURE);
208             }
209        } catch (URIException ex) {
210             Object JavaDoc[] array = {curi, curi.getPrerequisiteUri()};
211             getController().uriErrors.log(Level.INFO,ex.getMessage(), array);
212         } catch (NumberFormatException JavaDoc e) {
213             // UURI.createUURI will occasionally throw this error.
214
Object JavaDoc[] array = {curi, curi.getPrerequisiteUri()};
215             getController().uriErrors.log(Level.INFO,e.getMessage(), array);
216         }
217     }
218
219     protected void outOfScope(CandidateURI caUri) {
220         super.outOfScope(caUri);
221         if (!LOGGER.isLoggable(Level.INFO)) {
222             return;
223         }
224         // TODO: Fix filters so work on CandidateURI.
225
CrawlURI curi = (caUri instanceof CrawlURI)?
226             (CrawlURI)caUri:
227             new CrawlURI(caUri.getUURI());
228         if (filtersAccept(this.rejectLogFilters, curi)) {
229             LOGGER.info(curi.getUURI().toString());
230         }
231     }
232     
233     private boolean considerAsSeed(final CrawlURI curi, final Link wref,
234             final boolean redirectsNewSeeds) {
235         // Check if this is a seed with a 301 or 302.
236
if (curi.isSeed()
237                 && (curi.getFetchStatus() == 301 ||
238                     curi.getFetchStatus() == 302)
239                 && wref.getHopType() == Link.REFER_HOP) {
240             // Check if redirects from seeds should be treated as seeds.
241
if (redirectsNewSeeds) {
242                 return true;
243             }
244         }
245         return false;
246     }
247     
248     /**
249      * Determine scheduling for the <code>curi</code>.
250      * As with the LinksScoper in general, this only handles extracted links,
251      * seeds do not pass through here, but are given MEDIUM priority.
252      * Imports into the frontier similarly do not pass through here,
253      * but are given NORMAL priority.
254      */

255     protected int getSchedulingFor(final CrawlURI curi, final Link wref,
256             final int preferenceDepthHops) {
257         final char c = wref.getHopType();
258         if (LOGGER.isLoggable(Level.FINEST)) {
259             LOGGER.finest(curi + " with path=" + curi.getPathFromSeed() +
260                 " isSeed=" + curi.isSeed() + " with fetchStatus=" +
261                 curi.getFetchStatus() + " -> " + wref.getDestination() +
262                 " type " + c + " with context=" + wref.getContext());
263         }
264
265         switch (c) {
266             case Link.REFER_HOP:
267                 // Treat redirects somewhat urgently
268
// This also ensures seed redirects remain seed priority
269
return (preferenceDepthHops >= 0 ? CandidateURI.HIGH :
270                     CandidateURI.MEDIUM);
271             default:
272                 if (preferenceDepthHops == 0)
273                     return CandidateURI.HIGH;
274                     // this implies seed redirects are treated as path
275
// length 1, which I belive is standard.
276
// curi.getPathFromSeed() can never be null here, because
277
// we're processing a link extracted from curi
278
if (preferenceDepthHops > 0 &&
279                     curi.getPathFromSeed().length() + 1 <= preferenceDepthHops)
280                     return CandidateURI.HIGH;
281                 // Everything else normal (at least for now)
282
return CandidateURI.NORMAL;
283         }
284     }
285 }
286
Popular Tags