KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > archive > crawler > frontier > AntiCalendarCostAssignmentPolicy


1 /* AntiCalendarCostAssignmentPolicy
2 *
3 * $Id: AntiCalendarCostAssignmentPolicy.java,v 1.4.12.1 2007/01/13 01:31:23 stack-sf Exp $
4 *
5 * Created on Dec 15, 2004
6 *
7 * Copyright (C) 2004 Internet Archive.
8 *
9 * This file is part of the Heritrix web crawler (crawler.archive.org).
10 *
11 * Heritrix is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU Lesser Public License as published by
13 * the Free Software Foundation; either version 2.1 of the License, or
14 * any later version.
15 *
16 * Heritrix is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU Lesser Public License for more details.
20 *
21 * You should have received a copy of the GNU Lesser Public License
22 * along with Heritrix; if not, write to the Free Software
23 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
24 */

25 package org.archive.crawler.frontier;
26
27 import java.util.regex.Matcher;
28
29 import org.archive.crawler.datamodel.CrawlURI;
30 import org.archive.util.TextUtils;
31
32 /**
33  * CostAssignmentPolicy that further penalizes URIs with
34  * calendar-suggestive strings in them, with an extra unit
35  * of cost.
36  *
37  * Will catch some 'innocent' URIs, but only when uncaught
38  * large-volume chaff is ranked higher than caught 'wheat'
39  * will this cause notable problems.
40  *
41  * @author gojomo
42  */

43 public class AntiCalendarCostAssignmentPolicy extends WagCostAssignmentPolicy {
44     public static String JavaDoc CALENDARISH =
45             "(?i)(calendar)|(year)|(month)|(day)|(date)|(viewcal)" +
46             "|(\\D19\\d\\d\\D)|(\\D20\\d\\d\\D)|(event)|(yr=)" +
47             "|(calendrier)|(jour)";
48     
49     /* (non-Javadoc)
50      * @see org.archive.crawler.frontier.CostAssignmentPolicy#costOf(org.archive.crawler.datamodel.CrawlURI)
51      */

52     public int costOf(CrawlURI curi) {
53         int cost = super.costOf(curi);
54         Matcher JavaDoc m = TextUtils.getMatcher(CALENDARISH, curi.toString());
55         if (m.find()) {
56             cost++;
57             // TODO: consider if multiple occurences should cost more
58
}
59         TextUtils.recycleMatcher(m);
60         return cost;
61     }
62 }
63
Popular Tags