1 23 package org.archive.crawler.url.canonicalize; 24 25 import java.util.regex.Pattern ; 26 27 28 29 38 public class StripWWWNRule extends BaseRule { 39 private static final long serialVersionUID = 3619916990307308590L; 40 41 private static final String DESCRIPTION = "Strip any 'www[0-9]*' found. " + 42 "Use this rule to equate 'http://www.archive.org/index.html' and " + 43 "'http://www0001.archive.org/index.html' with " + 44 "'http://archive.org/index.html'. The resulting canonicalization " + 45 "returns 'http://archive.org/index.html'. It removes any www's " + 46 "or wwwNNN's found, where 'N' is one or more numerics, EXCEPT " + 47 "on URIs that have no path/query component " + 48 ". Top-level 'slash page' URIs are left unstripped: we prefer " + 49 "crawling redundant top pages to missing an entire site only " + 50 "available from either the www-full or www-less hostname, but not " + 51 "both. Operates on http and https schemes only. " + 52 "Use StripWWWRule to strip a lone 'www' only (This rule is a " + 53 "more general version of StripWWWRule)."; 54 55 private static final Pattern REGEX = 56 Pattern.compile("(?i)^(https?://)(?:www[0-9]*\\.)([^/]*/.+)$"); 57 58 public StripWWWNRule(String name) { 59 super(name, DESCRIPTION); 60 } 61 62 public String canonicalize(String url, Object context) { 63 return doStripRegexMatch(url, REGEX.matcher(url)); 64 } 65 } | Popular Tags |