package org.archive.crawler.url;

import java.io.File;

import org.apache.commons.httpclient.URIException;
import org.archive.crawler.datamodel.CrawlOrder;
import org.archive.crawler.settings.MapType;
import org.archive.crawler.settings.XMLSettingsHandler;
import org.archive.crawler.url.canonicalize.FixupQueryStr;
import org.archive.crawler.url.canonicalize.LowercaseRule;
import org.archive.crawler.url.canonicalize.StripSessionIDs;
import org.archive.crawler.url.canonicalize.StripUserinfoRule;
import org.archive.crawler.url.canonicalize.StripWWWRule;
import org.archive.net.UURIFactory;
import org.archive.util.TmpDirTestCase;

/**
 * Tests {@link Canonicalizer} against a chain of canonicalization rules
 * registered on a freshly initialized crawl order.
 */
public class CanonicalizerTest extends TmpDirTestCase {
    /** Order file handed to the settings handler; lives in the test tmp dir. */
    private File orderFile;

    /** Settings handler built on top of {@link #orderFile}. */
    protected XMLSettingsHandler settingsHandler;

    /** The crawl order's rules map, populated in {@link #setUp()}. */
    private MapType rules = null;

    /**
     * Builds a settings handler around an order file in the tmp dir and
     * appends the canonicalization rules under test to the crawl order's
     * rules map.
     *
     * @throws Exception if the superclass set-up or the settings
     *         initialization fails.
     */
    protected void setUp() throws Exception {
        super.setUp();
        // Name the order file after this class so it cannot collide with
        // files written by other tests sharing the tmp dir.
        this.orderFile = new File(getTmpDir(),
            this.getClass().getName() + ".order.xml");
        this.settingsHandler = new XMLSettingsHandler(this.orderFile);
        this.settingsHandler.initialize();

        this.rules = (MapType) (this.settingsHandler.getSettingsObject(null))
            .getModule(CrawlOrder.ATTR_NAME)
            .getAttribute(CrawlOrder.ATTR_RULES);
        // NOTE(review): rules are presumably applied in insertion order --
        // keep this sequence unless MapType documents otherwise.
        this.rules.addElement(null, new LowercaseRule("lowercase"));
        this.rules.addElement(null, new StripUserinfoRule("userinfo"));
        this.rules.addElement(null, new StripWWWRule("www"));
        this.rules.addElement(null, new StripSessionIDs("ids"));
        this.rules.addElement(null, new FixupQueryStr("querystr"));
    }

    /**
     * Checks that an already-canonical URL passes through unchanged, and
     * that a leading "www." and a jsessionid query string are both removed.
     *
     * @throws URIException if one of the test URLs cannot be parsed.
     */
    public void testCanonicalize() throws URIException {
        final String scheme = "http://";
        final String nonQueryStr = "archive.org/index.html";
        final String result = scheme + nonQueryStr;

        assertCanonicalizesTo("Mangled original", result, result);
        assertCanonicalizesTo("Mangled www", result,
            scheme + "www." + nonQueryStr);
        assertCanonicalizesTo("Mangled sessionid", result,
            scheme + "www." + nonQueryStr
                + "?jsessionid=01234567890123456789012345678901");
    }

    /**
     * Canonicalizes <code>input</code> with the configured rules and asserts
     * that the outcome equals <code>expected</code>.
     *
     * @param message assertion message reported on failure.
     * @param expected the canonical form the input must reduce to; also
     *        used as the context URI when obtaining the rules iterator.
     * @param input the URL to canonicalize.
     * @throws URIException if either URL cannot be parsed.
     */
    private void assertCanonicalizesTo(String message, String expected,
            String input) throws URIException {
        String actual = Canonicalizer.canonicalize(
            UURIFactory.getInstance(input),
            this.rules.iterator(UURIFactory.getInstance(expected)));
        // assertEquals (rather than assertTrue on equals()) so a failure
        // reports the canonicalized value that was actually produced.
        assertEquals(message, expected, actual);
    }
}