1 23 package org.archive.crawler.url.canonicalize; 24 25 import java.util.regex.Pattern ; 26 27 28 33 public class StripSessionIDs 34 extends BaseRule { 35 36 private static final long serialVersionUID = -3737115200690525641L; 37 38 private static final String DESCRIPTION = "Strip known session IDs. " + 39 "Use this rule to remove all of a set of known session IDs." + 40 " For example, this rule will strip JSESSIONID and its value from" + 41 " 'http://archive.org/index.html?" + 42 "JSESSIONID=DDDSSE233232333355FFSXXXXDSDSDS'. The resulting" + 43 " canonicalization returns 'http://archive.org/index.html'." + 44 " This rule strips JSESSIONID, ASPSESSIONID, PHPSESSID, and 'sid'" + 45 " session ids."; 46 47 51 private static final Pattern BASE_PATTERN = Pattern.compile("^(.+)" + 52 "(?:(?:(?:jsessionid)|(?:phpsessid))=" + 53 "[0-9a-zA-Z]{32})(?:&(.*))?$", Pattern.CASE_INSENSITIVE); 54 55 62 private static final Pattern SID_PATTERN = 63 Pattern.compile("^(.+)" + 64 "(?:sid=[0-9a-zA-Z]{32})(?:&(.*))?$", Pattern.CASE_INSENSITIVE); 65 66 69 private static final Pattern ASPSESSION_PATTERN = 70 Pattern.compile("^(.+)" + 71 "(?:ASPSESSIONID[a-zA-Z]{8}=[a-zA-Z]{24})(?:&(.*))?$", 72 Pattern.CASE_INSENSITIVE); 73 74 75 public StripSessionIDs(String name) { 76 super(name, DESCRIPTION); 77 } 78 79 public String canonicalize(String url, Object context) { 80 url = doStripRegexMatch(url, BASE_PATTERN.matcher(url)); 81 url = doStripRegexMatch(url, SID_PATTERN.matcher(url)); 82 url = doStripRegexMatch(url, ASPSESSION_PATTERN.matcher(url)); 83 return url; 84 } 85 } | Popular Tags |