1 23 package org.archive.crawler.url.canonicalize; 24 25 26 27 28 33 public class FixupQueryStr 34 extends BaseRule { 35 36 private static final long serialVersionUID = 3169526832544474794L; 37 38 private static final String DESCRIPTION = 39 "Fixup the question mark that leads off the query string. " + 40 "This rule returns 'http://www.archive.org/index.html' if passed" + 41 " 'http://www.archive.org/index.html?'. It will also strip '?&'" + 42 " if '?&' is all that comprises the query string. Also strips" + 43 " extraneous leading '&': Returns 'http://archive.org/index.html?x=y" + 44 " if passed 'http://archive.org/index.html?&x=y." + 45 " Will also strip '&' if last thing in query string." + 46 " Operates on all schemes. This is a good rule to run toward the" + 47 " end of canonicalization processing."; 48 49 public FixupQueryStr(String name) { 50 super(name, DESCRIPTION); 51 } 52 53 public String canonicalize(String url, Object context) { 54 if (url == null || url.length() <= 0) { 55 return url; 56 } 57 58 int index = url.lastIndexOf('?'); 59 if (index > 0) { 60 if (index == (url.length() - 1)) { 61 url = url.substring(0, url.length() - 1); 63 } else if (url.charAt(index + 1) == '&') { 64 if (url.length() == (index + 2)) { 66 url = url.substring(0, url.length() - 2); 68 } else { 69 url = url.substring(0, index + 1) + 71 url.substring(index + 2); 72 } 73 } else if (url.charAt(url.length() - 1) == '&') { 74 url = url.substring(0, url.length() - 1); 77 } 78 } 79 return url; 80 } 81 } 82 | Popular Tags |