KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > archive > crawler > url > canonicalize > StripWWWNRule


1 /* StripWWWNRule
2  *
3  * Created on Oct 5, 2004
4  *
5  * Copyright (C) 2004 Internet Archive.
6  *
7  * This file is part of the Heritrix web crawler (crawler.archive.org).
8  *
9  * Heritrix is free software; you can redistribute it and/or modify
10  * it under the terms of the GNU Lesser Public License as published by
11  * the Free Software Foundation; either version 2.1 of the License, or
12  * any later version.
13  *
14  * Heritrix is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17  * GNU Lesser Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser Public License
20  * along with Heritrix; if not, write to the Free Software
21  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22  */

23 package org.archive.crawler.url.canonicalize;
24
25 import java.util.regex.Pattern JavaDoc;
26
27
28
29 /**
30  * Strip any 'www[0-9]*' found on http/https URLs IF they have some
31  * path/query component (content after third slash). Top 'slash page'
32  * URIs are left unstripped: we prefer crawling redundant
33  * top pages to missing an entire site only available from either
34  * the www-full or www-less hostname, but not both.
35  * @author stack
36  * @version $Date: 2007/01/13 01:31:29 $, $Revision: 1.2.6.2 $
37  */

38 public class StripWWWNRule extends BaseRule {
39     private static final long serialVersionUID = 3619916990307308590L;
40
41     private static final String JavaDoc DESCRIPTION = "Strip any 'www[0-9]*' found. " +
42         "Use this rule to equate 'http://www.archive.org/index.html' and " +
43         "'http://www0001.archive.org/index.html' with " +
44         "'http://archive.org/index.html'. The resulting canonicalization " +
45         "returns 'http://archive.org/index.html'. It removes any www's " +
46         "or wwwNNN's found, where 'N' is one or more numerics, EXCEPT " +
47         "on URIs that have no path/query component " +
48         ". Top-level 'slash page' URIs are left unstripped: we prefer " +
49         "crawling redundant top pages to missing an entire site only " +
50         "available from either the www-full or www-less hostname, but not " +
51         "both. Operates on http and https schemes only. " +
52         "Use StripWWWRule to strip a lone 'www' only (This rule is a " +
53         "more general version of StripWWWRule).";
54     
55     private static final Pattern JavaDoc REGEX =
56         Pattern.compile("(?i)^(https?://)(?:www[0-9]*\\.)([^/]*/.+)$");
57
58     public StripWWWNRule(String JavaDoc name) {
59         super(name, DESCRIPTION);
60     }
61
62     public String JavaDoc canonicalize(String JavaDoc url, Object JavaDoc context) {
63         return doStripRegexMatch(url, REGEX.matcher(url));
64     }
65 }
Popular Tags