1 23 package org.archive.crawler.url.canonicalize; 24 25 import java.io.File ; 26 27 import javax.management.InvalidAttributeValueException ; 28 29 import org.apache.commons.httpclient.URIException; 30 import org.archive.crawler.datamodel.CrawlOrder; 31 import org.archive.crawler.settings.MapType; 32 import org.archive.crawler.settings.XMLSettingsHandler; 33 import org.archive.net.UURIFactory; 34 import org.archive.util.TmpDirTestCase; 35 36 37 42 public class RegexRuleTest extends TmpDirTestCase { 43 private File orderFile; 44 protected XMLSettingsHandler settingsHandler; 45 private MapType rules = null; 46 47 protected void setUp() throws Exception { 48 super.setUp(); 49 this.orderFile = new File (getTmpDir(), this.getClass().getName() + 50 ".order.xml"); 51 this.settingsHandler = new XMLSettingsHandler(orderFile); 52 this.settingsHandler.initialize(); 53 this.rules = (MapType)(settingsHandler.getSettingsObject(null)). 54 getModule(CrawlOrder.ATTR_NAME). 55 getAttribute(CrawlOrder.ATTR_RULES); 56 } 57 58 public void testCanonicalize() 59 throws URIException, InvalidAttributeValueException { 60 final String url = "http://www.aRchive.Org/index.html"; 61 RegexRule rr = new RegexRule("Test " + this.getClass().getName()); 62 this.rules.addElement(null, rr); 63 rr.canonicalize(url, UURIFactory.getInstance(url)); 64 String product = rr.canonicalize(url, null); 65 assertTrue("Default doesn't work.", url.equals(product)); 66 } 67 68 public void testSessionid() 69 throws InvalidAttributeValueException { 70 final String urlBase = "http://joann.com/catalog.jhtml"; 71 final String urlMinusSessionid = urlBase + "?CATID=96029"; 72 final String url = urlBase + 73 ";$sessionid$JKOFFNYAAKUTIP4SY5NBHOR50LD3OEPO?CATID=96029"; 74 RegexRule rr = new RegexRule("Test", 75 "^(.+)(?:;\\$sessionid\\$[A-Z0-9]{32})(\\?.*)+$", 76 "$1$2"); 77 this.rules.addElement(null, rr); 78 String product = rr.canonicalize(url, null); 79 assertTrue("Failed " + url, urlMinusSessionid.equals(product)); 80 } 81 82 public void testNullFormat() 83 throws InvalidAttributeValueException { 84 final String urlBase = "http://joann.com/catalog.jhtml"; 85 final String url = urlBase + 86 ";$sessionid$JKOFFNYAAKUTIP4SY5NBHOR50LD3OEPO"; 87 RegexRule rr = new RegexRule("Test", 88 "^(.+)(?:;\\$sessionid\\$[A-Z0-9]{32})$", 89 "$1$2"); 90 this.rules.addElement(null, rr); 91 String product = rr.canonicalize(url, null); 92 assertTrue("Failed " + url, urlBase.equals(product)); 93 } 94 } 95 | Popular Tags |