KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > archive > crawler > url > CanonicalizerTest


1 /* CanonicalizerTest
2  *
3  * Created on Oct 7, 2004
4  *
5  * Copyright (C) 2004 Internet Archive.
6  *
7  * This file is part of the Heritrix web crawler (crawler.archive.org).
8  *
9  * Heritrix is free software; you can redistribute it and/or modify
10  * it under the terms of the GNU Lesser Public License as published by
11  * the Free Software Foundation; either version 2.1 of the License, or
12  * any later version.
13  *
14  * Heritrix is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17  * GNU Lesser Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser Public License
20  * along with Heritrix; if not, write to the Free Software
21  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22  */

23 package org.archive.crawler.url;
24
25 import java.io.File JavaDoc;
26
27 import org.apache.commons.httpclient.URIException;
28 import org.archive.crawler.datamodel.CrawlOrder;
29 import org.archive.crawler.settings.MapType;
30 import org.archive.crawler.settings.XMLSettingsHandler;
31 import org.archive.crawler.url.canonicalize.FixupQueryStr;
32 import org.archive.crawler.url.canonicalize.LowercaseRule;
33 import org.archive.crawler.url.canonicalize.StripSessionIDs;
34 import org.archive.crawler.url.canonicalize.StripUserinfoRule;
35 import org.archive.crawler.url.canonicalize.StripWWWRule;
36 import org.archive.net.UURIFactory;
37 import org.archive.util.TmpDirTestCase;
38
39 /**
40  * Test canonicalization.
41  * @author stack
42  * @version $Date: 2007/01/13 01:31:28 $, $Revision: 1.7.16.1 $
43  */

44 public class CanonicalizerTest extends TmpDirTestCase {
45     private File JavaDoc orderFile;
46     protected XMLSettingsHandler settingsHandler;
47
48     private MapType rules = null;
49     
50     protected void setUp() throws Exception JavaDoc {
51         super.setUp();
52         this.orderFile = new File JavaDoc(getTmpDir(), this.getClass().getName() +
53             ".order.xml");
54         this.settingsHandler = new XMLSettingsHandler(orderFile);
55         this.settingsHandler.initialize();
56         
57         this.rules = (MapType)(settingsHandler.getSettingsObject(null)).
58             getModule(CrawlOrder.ATTR_NAME).
59                getAttribute(CrawlOrder.ATTR_RULES);
60         this.rules.addElement(null, new LowercaseRule("lowercase"));
61         this.rules.addElement(null, new StripUserinfoRule("userinfo"));
62         this.rules.addElement(null, new StripWWWRule("www"));
63         this.rules.addElement(null, new StripSessionIDs("ids"));
64         this.rules.addElement(null, new FixupQueryStr("querystr"));
65     }
66     
67     public void testCanonicalize() throws URIException {
68         final String JavaDoc scheme = "http://";
69         final String JavaDoc nonQueryStr = "archive.org/index.html";
70         final String JavaDoc result = scheme + nonQueryStr;
71         assertTrue("Mangled original", result.equals(
72             Canonicalizer.canonicalize(UURIFactory.getInstance(result),
73                 this.rules.iterator(UURIFactory.getInstance(result)))));
74         String JavaDoc tmp = scheme + "www." + nonQueryStr;
75         assertTrue("Mangled www", result.equals(
76             Canonicalizer.canonicalize(UURIFactory.getInstance(tmp),
77                 this.rules.iterator(UURIFactory.getInstance(result)))));
78         tmp = scheme + "www." + nonQueryStr +
79             "?jsessionid=01234567890123456789012345678901";
80         assertTrue("Mangled sessionid", result.equals(
81             Canonicalizer.canonicalize(UURIFactory.getInstance(tmp),
82                 this.rules.iterator(UURIFactory.getInstance(result)))));
83         tmp = scheme + "www." + nonQueryStr +
84             "?jsessionid=01234567890123456789012345678901";
85         assertTrue("Mangled sessionid", result.equals(
86              Canonicalizer.canonicalize(UURIFactory.getInstance(tmp),
87                    this.rules.iterator(UURIFactory.getInstance(result)))));
88     }
89 }
90
Popular Tags