LinksScoper


1   /* LinksScoper
2    * 
3    * $Id: LinksScoper.java,v 1.8.4.1 2007/01/13 01:31:24 stack-sf Exp $
4    *
5    * Created on Oct 2, 2003
6    * 
7    * Copyright (C) 2003 Internet Archive.
8    *
9    * This file is part of the Heritrix web crawler (crawler.archive.org).
10   *
11   * Heritrix is free software; you can redistribute it and/or modify
12   * it under the terms of the GNU Lesser Public License as published by
13   * the Free Software Foundation; either version 2.1 of the License, or
14   * any later version.
15   *
16   * Heritrix is distributed in the hope that it will be useful,
17   * but WITHOUT ANY WARRANTY; without even the implied warranty of
18   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19   * GNU Lesser Public License for more details.
20   *
21   * You should have received a copy of the GNU Lesser Public License
22   * along with Heritrix; if not, write to the Free Software
23   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
24   *
25   */
26  package org.archive.crawler.postprocessor;
27  
28  import java.util.Collection  ;
29  import java.util.HashSet  ;
30  import java.util.Iterator  ;
31  import java.util.logging.Level  ;
32  import java.util.logging.Logger  ;
33  
34  import org.apache.commons.httpclient.URIException;
35  import org.archive.crawler.datamodel.CandidateURI;
36  import org.archive.crawler.datamodel.CrawlURI;
37  import org.archive.crawler.datamodel.FetchStatusCodes;
38  import org.archive.crawler.extractor.Link;
39  import org.archive.crawler.framework.Filter;
40  import org.archive.crawler.framework.Scoper;
41  import org.archive.crawler.settings.MapType;
42  import org.archive.crawler.settings.SimpleType;
43  import org.archive.crawler.settings.Type;
44  
45  /**
46   * Determine which extracted links are within scope.
47   * TODO: To test scope, requires that Link be converted to
48   * a CandidateURI.  Make it so don't have to make a CandidateURI to test
49   * if Link is in scope.
50   * <p>Since this scoper has to create CandidateURIs, no sense
51   * discarding them since later in the processing chain CandidateURIs rather
52   * than Links are whats needed scheduling extracted links w/ the
53   * Frontier (Frontier#schedule expects CandidateURI, not Link).  This class
54   * replaces Links w/ the CandidateURI that wraps the Link in the CrawlURI.
55   *
56   * @author gojomo
57   * @author stack
58   */
59  public class LinksScoper extends Scoper
60  implements FetchStatusCodes {
61  
62      private static final long serialVersionUID = -4074442117992496793L;
63  
64      private static Logger   LOGGER =
65          Logger.getLogger(LinksScoper.class.getName());
66  
67      private final static String   ATTR_SEED_REDIRECTS_NEW_SEEDS =
68          "seed-redirects-new-seed";
69      
70      private final static Boolean   DEFAULT_SEED_REDIRECTS_NEW_SEEDS =
71          new Boolean  (true);
72      
73      public static final String   ATTR_LOG_REJECT_FILTERS =
74          "scope-rejected-url-filters";
75      
76      public static final String   ATTR_PREFERENCE_DEPTH_HOPS =
77          "preference-depth-hops";
78  
79      private final static Integer   DEFAULT_PREFERENCE_DEPTH_HOPS =
80          new Integer  (-1);
81      
82      /**
83       * Instance of rejected uris log filters.
84       */
85      private MapType rejectLogFilters = null;
86      
87      /**
88       * @param name Name of this filter.
89       */
90      public LinksScoper(String   name) {
91          super(name, "LinksScoper. Rules on which extracted links " +
92              "are within configured scope.");
93          
94          Type t;
95          t = addElementToDefinition(
96              new SimpleType(ATTR_SEED_REDIRECTS_NEW_SEEDS,
97              "If enabled, any URL found because a seed redirected to it " +
98              "(original seed returned 301 or 302), will also be treated " +
99              "as a seed.", DEFAULT_SEED_REDIRECTS_NEW_SEEDS));
100         t.setExpertSetting(true);
101 
102         t = addElementToDefinition(new SimpleType(ATTR_PREFERENCE_DEPTH_HOPS,
103             "Number of hops (of any sort) from a seed up to which a URI has higher " +
104         "priority scheduling than any remaining seed. For example, if set to 1 items one " + 
105         "hop (link, embed, redirect, etc.) away from a seed will be scheduled " + 
106         "with HIGH priority. If set to -1, no " + 
107         "preferencing will occur, and a breadth-first search with seeds " + 
108         "processed before discovered links will proceed. If set to zero, a " + 
109         "purely depth-first search will proceed, with all discovered links processed " + 
110         "before remaining seeds.  Seed redirects are treated as one hop from a seed.",
111         DEFAULT_PREFERENCE_DEPTH_HOPS));
112         t.setExpertSetting(true);
113         
114         this.rejectLogFilters = (MapType)addElementToDefinition(
115             new MapType(ATTR_LOG_REJECT_FILTERS, "Filters applied after " +
116                "an URI has been rejected. If filter return " +
117                "TRUE, the URI is logged (if the logging level is INFO). " +
118                "Depends on " + ATTR_OVERRIDE_LOGGER_ENABLED +
119                " being enabled.", Filter.class));
120         this.rejectLogFilters.setExpertSetting(true);
121     }
122 
123     protected void innerProcess(final CrawlURI curi) {
124         if (LOGGER.isLoggable(Level.FINEST)) {
125             LOGGER.finest(getName() + " processing " + curi);
126         }
127         
128         // If prerequisites, nothing to be done in here.
129         if (curi.hasPrerequisiteUri()) {
130             handlePrerequisite(curi);
131             return;
132         }
133         
134         // Don't extract links of error pages.
135         if (curi.getFetchStatus() < 200 || curi.getFetchStatus() >= 400) {
136             curi.clearOutlinks();
137             return;
138         }
139         
140         if (curi.outlinksSize() <= 0) {
141             // No outlinks to process.
142             return;
143         }
144 
145         final boolean redirectsNewSeeds = ((Boolean  )getUncheckedAttribute(curi,
146             ATTR_SEED_REDIRECTS_NEW_SEEDS)).booleanValue();
147         int preferenceDepthHops = ((Integer  )getUncheckedAttribute(curi,
148             ATTR_PREFERENCE_DEPTH_HOPS)).intValue();
149         Collection  <CandidateURI> inScopeLinks = new HashSet  <CandidateURI>();
150         for (final Iterator   i = curi.getOutObjects().iterator(); i.hasNext();) {
151             Object   o = i.next();
152             if(o instanceof Link){
153                 final Link wref = (Link)o;
154                 try {
155                     final int directive = getSchedulingFor(curi, wref, 
156                         preferenceDepthHops);
157                     final CandidateURI caURI =
158                         curi.createCandidateURI(curi.getBaseURI(), wref, 
159                             directive, 
160                             considerAsSeed(curi, wref, redirectsNewSeeds));
161                     if (isInScope(caURI)) {
162                         inScopeLinks.add(caURI);
163                     }
164                 } catch (URIException e) {
165                     getController().logUriError(e, curi.getUURI(), 
166                         wref.getDestination().toString());
167                 }
168             } else if(o instanceof CandidateURI){
169                 CandidateURI caURI = (CandidateURI)o;
170                 if(isInScope(caURI)){
171                     inScopeLinks.add(caURI);
172                 }
173             } else {
174                 LOGGER.severe("Unexpected type: " + o);
175             }
176         }
177         // Replace current links collection w/ inscopeLinks.  May be
178         // an empty collection.
179         curi.replaceOutlinks(inScopeLinks);
180     }
181     
182     /**
183      * The CrawlURI has a prerequisite; apply scoping and update
184      * Link to CandidateURI in manner analogous to outlink handling. 
185      * @param curi CrawlURI with prereq to consider
186      */
187     protected void handlePrerequisite(CrawlURI curi) {
188         try {
189             // Create prerequisite CandidateURI
190             CandidateURI caUri =
191                 curi.createCandidateURI(curi.getBaseURI(),
192                     (Link) curi.getPrerequisiteUri());
193             int prereqPriority = curi.getSchedulingDirective() - 1;
194             if (prereqPriority < 0) {
195                 prereqPriority = 0;
196                 LOGGER.severe("Unable to promote prerequisite " + caUri +
197                     " above " + curi);
198             }
199             caUri.setSchedulingDirective(prereqPriority);
200             caUri.setForceFetch(true);
201             if(isInScope(caUri)) {
202                 // replace link with CandidateURI
203                 curi.setPrerequisiteUri(caUri);
204             } else {
205                 // prerequisite is out-of-scope; mark CrawlURI as error,
206                 // preventinting normal S_DEFERRED handling
207                 curi.setFetchStatus(S_PREREQUISITE_UNSCHEDULABLE_FAILURE);
208             }
209        } catch (URIException ex) {
210             Object  [] array = {curi, curi.getPrerequisiteUri()};
211             getController().uriErrors.log(Level.INFO,ex.getMessage(), array);
212         } catch (NumberFormatException   e) {
213             // UURI.createUURI will occasionally throw this error.
214             Object  [] array = {curi, curi.getPrerequisiteUri()};
215             getController().uriErrors.log(Level.INFO,e.getMessage(), array);
216         }
217     }
218 
219     protected void outOfScope(CandidateURI caUri) {
220         super.outOfScope(caUri);
221         if (!LOGGER.isLoggable(Level.INFO)) {
222             return;
223         }
224         // TODO: Fix filters so work on CandidateURI.
225         CrawlURI curi = (caUri instanceof CrawlURI)?
226             (CrawlURI)caUri:
227             new CrawlURI(caUri.getUURI());
228         if (filtersAccept(this.rejectLogFilters, curi)) {
229             LOGGER.info(curi.getUURI().toString());
230         }
231     }
232     
233     private boolean considerAsSeed(final CrawlURI curi, final Link wref,
234             final boolean redirectsNewSeeds) {
235         // Check if this is a seed with a 301 or 302.
236         if (curi.isSeed()
237                 && (curi.getFetchStatus() == 301 ||
238                     curi.getFetchStatus() == 302)
239                 && wref.getHopType() == Link.REFER_HOP) {
240             // Check if redirects from seeds should be treated as seeds.
241             if (redirectsNewSeeds) {
242                 return true;
243             }
244         }
245         return false;
246     }
247     
248     /**
249      * Determine scheduling for the  <code>curi</code>.
250      * As with the LinksScoper in general, this only handles extracted links,
251      * seeds do not pass through here, but are given MEDIUM priority.  
252      * Imports into the frontier similarly do not pass through here, 
253      * but are given NORMAL priority.
254      */
255     protected int getSchedulingFor(final CrawlURI curi, final Link wref,
256             final int preferenceDepthHops) {
257         final char c = wref.getHopType();
258         if (LOGGER.isLoggable(Level.FINEST)) {
259             LOGGER.finest(curi + " with path=" + curi.getPathFromSeed() +
260                 " isSeed=" + curi.isSeed() + " with fetchStatus=" +
261                 curi.getFetchStatus() + " -> " + wref.getDestination() +
262                 " type " + c + " with context=" + wref.getContext());
263         }
264 
265         switch (c) {
266             case Link.REFER_HOP:
267                 // Treat redirects somewhat urgently
268                 // This also ensures seed redirects remain seed priority
269                 return (preferenceDepthHops >= 0 ? CandidateURI.HIGH :
270                     CandidateURI.MEDIUM);
271             default:
272                 if (preferenceDepthHops == 0)
273                     return CandidateURI.HIGH;
274                     // this implies seed redirects are treated as path
275                     // length 1, which I belive is standard.
276                     // curi.getPathFromSeed() can never be null here, because
277                     // we're processing a link extracted from curi
278                 if (preferenceDepthHops > 0 && 
279                     curi.getPathFromSeed().length() + 1 <= preferenceDepthHops)
280                     return CandidateURI.HIGH;
281                 // Everything else normal (at least for now)
282                 return CandidateURI.NORMAL;
283         }
284     }
285 }
286
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags