KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > archive > crawler > frontier > WagCostAssignmentPolicy


1 /* WagCostAssignmentPolicy
2 *
3 * $Id: WagCostAssignmentPolicy.java,v 1.4 2005/07/18 17:29:31 stack-sf Exp $
4 *
5 * Created on Dec 10, 2004
6 *
7 * Copyright (C) 2004 Internet Archive.
8 *
9 * This file is part of the Heritrix web crawler (crawler.archive.org).
10 *
11 * Heritrix is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU Lesser Public License as published by
13 * the Free Software Foundation; either version 2.1 of the License, or
14 * any later version.
15 *
16 * Heritrix is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU Lesser Public License for more details.
20 *
21 * You should have received a copy of the GNU Lesser Public License
22 * along with Heritrix; if not, write to the Free Software
23 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
24 */

25 package org.archive.crawler.frontier;
26
27 import org.archive.crawler.datamodel.CrawlURI;
28 import org.archive.net.UURI;
29
30 /**
31  * A CostAssignmentPolicy based on some wild guesses of kinds of URIs
32  * that should be deferred into the (potentially never-crawled) future.
33  *
34  * @author gojomo
35  */

36 public class WagCostAssignmentPolicy extends CostAssignmentPolicy {
37
38     /**
39      * Add constant penalties for certain features of URI (and
40      * its 'via') that make it more delayable/skippable.
41      *
42      * @param curi CrawlURI to be assigned a cost
43      *
44      * @see org.archive.crawler.frontier.CostAssignmentPolicy#costOf(org.archive.crawler.datamodel.CrawlURI)
45      */

46     public int costOf(CrawlURI curi) {
47         int cost = 1;
48         UURI uuri = curi.getUURI();
49         if (uuri.hasQuery()) {
50             // has query string
51
cost++;
52             int qIndex = uuri.toString().indexOf('?');
53             if (curi.flattenVia().startsWith(uuri.toString().substring(0,qIndex))) {
54                 // non-query-string portion of URI is same as previous
55
cost++;
56             }
57             // TODO: other potential query-related cost penalties:
58
// - more than X query-string attributes
59
// - calendarish terms
60
// - query-string over certain size
61
}
62         // TODO: other potential path-based penalties
63
// - new path is simply extension of via path
64
// - many path segments
65
// TODO: other potential hops-based penalties
66
// - more than X hops
67
// - each speculative hop
68
return cost;
69     }
70 }
71
Popular Tags