package org.archive.crawler.url.canonicalize;

import java.util.regex.Pattern;

/**
 * Canonicalization rule that drops a leading {@code www.} host label from
 * http and https URIs.
 *
 * <p>The pattern only matches when the authority is followed by a slash and
 * at least one further character, so a bare "slash page" such as
 * {@code http://www.archive.org/} is not matched by it.
 */
public class StripWWWRule extends BaseRule {

    private static final long serialVersionUID = -5416391108485746976L;

    /** Human-readable description handed to the superclass constructor. */
    private static final String DESCRIPTION =
        "Strip any 'www' found. "
        + "Use this rule to equate 'http://www.archive.org/index.html' "
        + "and 'http://archive.org/index.html'. "
        + "The resulting canonicalization returns "
        + "'http://archive.org/index.html'. "
        + "It removes any www's found, except on URIs that have no "
        + "path/query component ('slash' pages). "
        + "Operates on http and https schemes only. "
        + "Use the more general StripWWWNRule if you want to strip both "
        + "'www' and 'www01', 'www02', etc.";

    /**
     * Case-insensitive matcher for {@code http(s)://www.<rest>}.
     * Group 1 captures the scheme prefix; the {@code www.} label sits in a
     * non-capturing group; group 2 captures the remaining host plus a
     * non-empty path (a slash followed by at least one character).
     */
    private static final Pattern WWW_PREFIX =
        Pattern.compile("(?i)^(https?://)(?:www\\.)([^/]*/.+)$");

    /**
     * Creates the rule.
     *
     * @param name name passed through to the superclass constructor
     */
    public StripWWWRule(String name) {
        super(name, DESCRIPTION);
    }

    /**
     * Applies the {@code www.}-stripping pattern to the given URL.
     *
     * @param url the URL string to canonicalize
     * @param context caller-supplied context; not used by this rule
     * @return whatever the inherited {@code doStripRegexMatch} helper yields
     *         for this URL and matcher -- presumably the two captured groups
     *         joined on a match and the input unchanged otherwise
     *         (NOTE(review): confirm against BaseRule)
     */
    public String canonicalize(String url, Object context) {
        return doStripRegexMatch(url, WWW_PREFIX.matcher(url));
    }
}