package org.archive.crawler.url.canonicalize;

import java.util.regex.Pattern;

/**
 * Canonicalization rule that strips the 'userinfo' component
 * (e.g. {@code user:password@}) from a URL.
 *
 * <p>Only URLs whose scheme is http, https, ftp or ftps (matched case
 * insensitively) are candidates; a URL that does not match the pattern is
 * handed to the inherited {@code doStripRegexMatch} unchanged, which is
 * presumably a no-op in that case (helper is defined in {@code BaseRule};
 * confirm there).
 */
public class StripUserinfoRule extends BaseRule {

    private static final long serialVersionUID = -4271062607638914996L;

    /** Human-readable description passed to the {@code BaseRule} constructor. */
    private static final String DESCRIPTION = "Strip any 'userinfo' found. " +
        "Use this rule to equate 'http://stack:psswrd@archive.org/index.htm'" +
        " and 'http://archive.org/index.htm'. The resulting canonicalization" +
        " returns 'http://archive.org/index.htm'. Removes any userinfo" +
        " found. Operates on http/https/ftp/ftps schemes only.";

    /**
     * Matches a URL that carries userinfo.
     *
     * <ul>
     * <li>Group 1: the scheme plus {@code ://}.</li>
     * <li>Non-capturing: the userinfo and its trailing {@code @}.</li>
     * <li>Group 2: everything after the {@code @}.</li>
     * </ul>
     *
     * <p>The userinfo may not contain {@code /}, {@code ?} or {@code #}:
     * those characters terminate the authority component, so an {@code @}
     * that appears after one of them belongs to the path, query or fragment
     * and must not be treated as a userinfo delimiter. Without the
     * {@code ?}/{@code #} exclusions a URL such as
     * {@code http://archive.org?email=a@b.com} would lose its real host.
     */
    private static final Pattern REGEX =
        Pattern.compile("^((?:(?:https?)|(?:ftps?))://)(?:[^/?#]+@)(.*)$",
            Pattern.CASE_INSENSITIVE);

    /**
     * Creates the rule.
     *
     * @param name name of this rule, forwarded to the superclass together
     *             with the fixed description.
     */
    public StripUserinfoRule(String name) {
        super(name, DESCRIPTION);
    }

    /**
     * Applies the userinfo-stripping pattern to {@code url}.
     *
     * @param url     URL to canonicalize.
     * @param context caller-supplied context; not used by this rule.
     * @return the result of the inherited {@code doStripRegexMatch} for this
     *         URL and pattern (expected to be the URL without userinfo when
     *         the pattern matches; see {@code BaseRule} for exact semantics).
     */
    public String canonicalize(String url, Object context) {
        return doStripRegexMatch(url, REGEX.matcher(url));
    }
}