KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > archive > net > UURIFactoryTest


1 /* UURIFactoryTest
2  *
3  * $Id: UURIFactoryTest.java,v 1.12.4.1 2007/01/13 01:31:38 stack-sf Exp $
4  *
5  * Created on Apr 2, 2004
6  *
7  * Copyright (C) 2004 Internet Archive.
8  *
9  * This file is part of the Heritrix web crawler (crawler.archive.org).
10  *
11  * Heritrix is free software; you can redistribute it and/or modify
12  * it under the terms of the GNU Lesser Public License as published by
13  * the Free Software Foundation; either version 2.1 of the License, or
14  * any later version.
15  *
16  * Heritrix is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19  * GNU Lesser Public License for more details.
20  *
21  * You should have received a copy of the GNU Lesser Public License
22  * along with Heritrix; if not, write to the Free Software
23  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
24  */

25
26 package org.archive.net;
27
import java.util.Iterator;
import java.util.Map;
import java.util.TreeMap;

import junit.framework.TestCase;

import org.apache.commons.httpclient.URIException;
34
35 /**
36  * Test UURIFactory for proper UURI creation across variety of
37  * important/tricky cases.
38  *
39  * Be careful writing this file. Make sure you write it with UTF-8 encoding.
40  *
41  * @author igor stack gojomo
42  */

43 public class UURIFactoryTest extends TestCase {
44     
45     public final void testEscaping() throws URIException {
46         // Note: single quote is not being escaped by URI class.
47
final String JavaDoc ESCAPED_URISTR = "http://archive.org/" +
48             UURIFactory.ESCAPED_SPACE +
49             UURIFactory.ESCAPED_SPACE +
50             UURIFactory.ESCAPED_CIRCUMFLEX +
51             UURIFactory.ESCAPED_QUOT +
52             UURIFactory.SQUOT +
53             UURIFactory.ESCAPED_APOSTROPH +
54             UURIFactory.ESCAPED_LSQRBRACKET +
55             UURIFactory.ESCAPED_RSQRBRACKET +
56             UURIFactory.ESCAPED_LCURBRACKET +
57             UURIFactory.ESCAPED_RCURBRACKET +
58             UURIFactory.SLASH + "a.gif"; // NBSP and SPACE should be trimmed;
59

60         final String JavaDoc URISTR = "http://archive.org/.././" + "\u00A0" +
61             UURIFactory.SPACE + UURIFactory.CIRCUMFLEX +
62             UURIFactory.QUOT + UURIFactory.SQUOT +
63             UURIFactory.APOSTROPH + UURIFactory.LSQRBRACKET +
64             UURIFactory.RSQRBRACKET + UURIFactory.LCURBRACKET +
65             UURIFactory.RCURBRACKET + UURIFactory.BACKSLASH +
66             "test/../a.gif" + "\u00A0" + UURIFactory.SPACE;
67         
68         UURI uuri = UURIFactory.getInstance(URISTR);
69         final String JavaDoc uuriStr = uuri.toString();
70         assertEquals("expected escaping", ESCAPED_URISTR, uuriStr);
71     }
72
73     public final void testUnderscoreMakesPortParseFail() throws URIException {
74         UURI uuri = UURIFactory.getInstance("http://one-two_three:8080/index.html");
75         int port = uuri.getPort();
76         assertTrue("Failed find of port " + uuri, port == 8080);
77     }
78     
79     public final void testRelativeURIWithTwoSlashes() throws URIException {
80         UURI base = UURIFactory.getInstance("http://www.archive.org");
81         UURI uuri = UURIFactory.getInstance(base, "one//index.html");
82         assertTrue("Doesn't do right thing with two slashes " + uuri,
83             uuri.toString().equals(
84                 "http://www.archive.org/one//index.html"));
85     }
86     
87     public final void testTrailingEncodedSpace() throws URIException {
88         UURI uuri = UURIFactory.getInstance("http://www.nps-shoes.co.uk%20");
89         assertTrue("Doesn't strip trailing encoded space 1 " + uuri,
90             uuri.toString().equals("http://www.nps-shoes.co.uk/"));
91         uuri = UURIFactory.getInstance("http://www.nps-shoes.co.uk%20%20%20");
92         assertTrue("Doesn't strip trailing encoded space 2 " + uuri,
93             uuri.toString().equals("http://www.nps-shoes.co.uk/"));
94     }
95     
96     public final void testPort0080is80() throws URIException {
97         UURI uuri = UURIFactory.getInstance("http://archive.org:0080");
98         assertTrue("Doesn't strip leading zeros " + uuri,
99             uuri.toString().equals("http://archive.org/"));
100     }
101     
// DISABLING TEST AS PRECURSOR TO ELIMINATION
// The problematic input given -- specifically the "%6s" incomplete
// uri-escape -- shouldn't necessarily be rejected as a bad URI. IE and
// Firefox, at least, will attempt to fetch such a URL (getting, in this case
// against that ad server, a bad-request error). Ideally, we'd generate exactly
// the same request against the server as they do. However, with the most
// recent fixup for stray '%' signs, we come close, but not exactly. That's
// enough to cause this test to fail (it's not getting the expected exception)
// but our almost-URI, which might be what was intended, is better than trying
// nothing.
//    public final void testBadPath() {
//        String message = null;
//        try {
//            UURIFactory.getInstance("http://ads.as4x.tmcs.net/" +
//                "html.ng/site=cs&pagepos=102&page=home&adsize=1x1&context=" +
//                "generic&Params.richmedia=yes%26city%3Dseattle%26" +
//                "rstid%3D2415%26market_id%3D86%26brand%3Dcitysearch" +
//                "%6state%3DWA");
//        } catch (URIException e) {
//            message = e.getMessage();
//        }
//        assertNotNull("Didn't get expected exception.", message);
//    }
125

126     public final void testEscapeEncoding() throws URIException {
127         UURI uuri = UURIFactory.getInstance("http://www.y1y1.com/" +
128             "albums/userpics/11111/normal_%E3%E4%EC%EC%EC.jpg", "windows-1256");
129         uuri.getPath();
130     }
131     
132     public final void testTooLongAfterEscaping() {
133         StringBuffer JavaDoc buffer = new StringBuffer JavaDoc("http://www.archive.org/a/");
134         // Append bunch of spaces. When escaped, they'll triple in size.
135
for (int i = 0; i < 1024; i++) {
136             buffer.append(" ");
137         }
138         buffer.append("/index.html");
139         String JavaDoc message = null;
140         try {
141             UURIFactory.getInstance(buffer.toString());
142         } catch (URIException e) {
143             message = e.getMessage();
144         }
145         assertTrue("Wrong or no exception: " + message, (message != null) &&
146             message.startsWith("Created (escaped) uuri >"));
147     }
148     
149     public final void testFtpUris() throws URIException {
150         final String JavaDoc FTP = "ftp";
151         final String JavaDoc AUTHORITY = "pfbuser:pfbuser@mprsrv.agri.gov.cn";
152         final String JavaDoc PATH = "/clzreceive/";
153         final String JavaDoc uri = FTP + "://" + AUTHORITY + PATH;
154         UURI uuri = UURIFactory.getInstance(uri);
155         assertTrue("Failed to get matching scheme: " + uuri.getScheme(),
156                 (uuri.getScheme()).equals(FTP));
157         assertTrue("Failed to get matching authority: " +
158                 uuri.getAuthority(), (uuri.getAuthority()).equals(AUTHORITY));
159         assertTrue("Failed to get matching path: " +
160                 uuri.getPath(), (uuri.getPath()).equals(PATH));
161     }
162     
163     public final void testWhitespaceEscaped() throws URIException {
164         // Test that we get all whitespace even if the uri is
165
// already escaped.
166
String JavaDoc uri = "http://archive.org/index%25 .html";
167         String JavaDoc tgtUri = "http://archive.org/index%25%20.html";
168         UURI uuri = UURIFactory.getInstance(uri);
169         assertTrue("Not equal " + uuri.toString(),
170                 uuri.toString().equals(tgtUri));
171         uri = "http://archive.org/index%25\t.html";
172         tgtUri = "http://archive.org/index%25%09.html";
173         uuri = UURIFactory.getInstance(uri);
174         assertEquals("whitespace escaping", tgtUri, uuri.toString());
175         uri = "http://archive.org/index%25\u001D.html";
176         tgtUri = "http://archive.org/index%25%1D.html".toLowerCase();
177         uuri = UURIFactory.getInstance(uri);
178         assertEquals("whitespace escaping", tgtUri, uuri.toString());
179         uri = "http://gemini.info.usaid.gov/directory/" +
180             "pbResults.cfm?&urlNameLast=Adamson";
181         tgtUri = "http://gemini.info.usaid.gov/directory/faxResults.cfm?" +
182             "name=Charisse%20+Adamson,&location=RRB%20%20%20%205%2E08%2D006";
183         uuri = UURIFactory.getInstance(UURIFactory.getInstance(uri),
184             "faxResults.cfm?name=Charisse +Adamson,&location=" +
185             "RRB%20%20%20%205%2E08%2D006");
186         assertEquals("whitespace escaping", tgtUri, uuri.toString());
187     }
188     
//    public final void testFailedGetPath() throws URIException {
//        final String path = "/RealMedia/ads/" +
//            "click_lx.ads/%%PAGE%%/%%RAND%%/%%POS%%/%%CAMP%%/empty";
//        // decoding in getPath will interpret %CA as 8-bit escaped char,
//        // possibly incomplete
//        final String uri = "http://ads.nandomedia.com" + path;
//        final UURI uuri = UURIFactory.getInstance(uri);
//        String foundPath = uuri.getPath();
//        assertEquals("unexpected path", path, foundPath);
//    }
199

200     public final void testDnsHost() throws URIException {
201         String JavaDoc uri = "dns://ads.nandomedia.com:81/one.html";
202         UURI uuri = UURIFactory.getInstance(uri);
203         String JavaDoc host = uuri.getReferencedHost();
204         assertTrue("Host is wrong " + host, host.equals("ads.nandomedia.com"));
205         uri = "dns:ads.nandomedia.com";
206         uuri = UURIFactory.getInstance(uri);
207         host = uuri.getReferencedHost();
208         assertTrue("Host is wrong " + host, host.equals("ads.nandomedia.com"));
209         uri = "dns:ads.nandomedia.com?a=b";
210         uuri = UURIFactory.getInstance(uri);
211         host = uuri.getReferencedHost();
212         assertTrue("Host is wrong " + host, host.equals("ads.nandomedia.com"));
213     }
214     
215     public final void testPercentEscaping() throws URIException {
216         final String JavaDoc uri = "http://archive.org/%a%%%%%.html";
217         // tests indicate firefox (1.0.6) does not encode '%' at all
218
final String JavaDoc tgtUri = "http://archive.org/%a%%%%%.html";
219         UURI uuri = UURIFactory.getInstance(uri);
220         assertEquals("Not equal",tgtUri, uuri.toString());
221     }
222     
223     public final void testRelativeDblPathSlashes() throws URIException {
224         UURI base = UURIFactory.getInstance("http://www.archive.org/index.html");
225         UURI uuri = UURIFactory.getInstance(base, "JIGOU//KYC//INDEX.HTM");
226         assertTrue("Double slash not working " + uuri.toString(),
227                 uuri.getPath().equals("/JIGOU//KYC//INDEX.HTM"));
228     }
229     
230     public final void testRelativeWithScheme() throws URIException {
231         UURI base = UURIFactory.getInstance("http://www.example.com/some/page");
232         UURI uuri = UURIFactory.getInstance(base, "http:boo");
233         assertTrue("Relative with scheme not working " + uuri.toString(),
234                 uuri.toString().equals("http://www.example.com/some/boo"));
235     }
236     
237     public final void testBadBaseResolve() throws URIException {
238         UURI base = UURIFactory.getInstance("http://license.joins.com/board/" +
239             "etc_board_list.asp?board_name=new_main&b_type=&nPage=" +
240             "2&category=G&lic_id=70&site=changeup&g_page=changeup&g_sPage=" +
241             "notice&gate=02");
242         UURIFactory.getInstance(base, "http://www.changeup.com/...</a");
243     }
244     
245     public final void testTilde() throws URIException {
246         noChangeExpected("http://license.joins.com/~igor");
247     }
248     
249     public final void testCurlies() throws URIException {
250         // Firefox allows curlies in the query string portion of a URL only
251
// (converts curlies if they are in the path portion ahead of the
252
// query string).
253
UURI uuri =
254             noChangeExpected("http://license.joins.com/igor?one={curly}");
255         assertEquals(uuri.getQuery(), "one={curly}");
256         assertEquals(UURIFactory.
257                 getInstance("http://license.joins.com/igor{curly}.html").
258                     toString(),
259             "http://license.joins.com/igor%7Bcurly%7D.html");
260         boolean exception = false;
261         try {
262             UURIFactory.getInstance("http://license.{curly}.com/igor.html");
263         } catch (URIException u) {
264             exception = true;
265         }
266         assertTrue("Did not get exception.", exception);
267     }
268     
269     protected UURI noChangeExpected(final String JavaDoc original)
270     throws URIException {
271         UURI uuri = UURIFactory.getInstance(original);
272         assertEquals(original, uuri.toString());
273         return uuri;
274     }
275     
276     public final void testTrimSpaceNBSP() throws URIException {
277         final String JavaDoc uri = " http://archive.org/DIR WITH SPACES/" +
278         UURIFactory.NBSP + "home.html " + UURIFactory.NBSP + " ";
279         final String JavaDoc tgtUri =
280             "http://archive.org/DIR%20WITH%20SPACES/%20home.html";
281         UURI uuri = UURIFactory.getInstance(uri);
282         assertTrue("Not equal " + uuri.toString(),
283                 uuri.toString().equals(tgtUri));
284     }
285     
286     /**
287      * Test space plus encoding ([ 1010966 ] crawl.log has URIs with spaces in them).
288      * See <a HREF="http://sourceforge.net/tracker/index.php?func=detail&aid=1010966&group_id=73833&atid=539099">[ 1010966 ] crawl.log has URIs with spaces in them</a>.
289      * @throws URIException
290      */

291     public final void testSpaceDoubleEncoding() throws URIException {
292         final String JavaDoc uri = "http://www.brook.edu/i.html? %20taxonomy=Politics";
293         final String JavaDoc encodedUri =
294             "http://www.brook.edu/i.html?%20%20taxonomy=Politics";
295         UURI uuri = UURIFactory.getInstance(uri, "ISO-8859-1");
296         assertTrue("Not equal " + uuri.toString(),
297                 uuri.toString().equals(encodedUri));
298     }
299     
300     /**
301      * Test for doubly-encoded sequences.
302      * See <a HREF="https://sourceforge.net/tracker/index.php?func=detail&aid=966219&group_id=73833&atid=539099">[ 966219 ] UURI doubly-encodes %XX sequences</a>.
303      * @throws URIException
304      */

305     public final void testDoubleEncoding() throws URIException {
306         final char ae = '\u00E6';
307         final String JavaDoc uri = "http://archive.org/DIR WITH SPACES/home" +
308             ae + ".html";
309         final String JavaDoc encodedUri =
310             "http://archive.org/DIR%20WITH%20SPACES/home%E6.html";
311         UURI uuri = UURIFactory.getInstance(uri, "ISO-8859-1");
312         assertEquals("single encoding", encodedUri, uuri.toString());
313         // Dbl-encodes.
314
uuri = UURIFactory.getInstance(uuri.toString(), "ISO-8859-1");
315         uuri = UURIFactory.getInstance(uuri.toString(), "ISO-8859-1");
316         assertEquals("double encoding", encodedUri, uuri.toString());
317         // Do default utf-8 test.
318
uuri = UURIFactory.getInstance(uri);
319         final String JavaDoc encodedUtf8Uri =
320             "http://archive.org/DIR%20WITH%20SPACES/home%C3%A6.html";
321         assertEquals("Not equal utf8", encodedUtf8Uri, uuri.toString());
322         // Now dbl-encode.
323
uuri = UURIFactory.getInstance(uuri.toString());
324         uuri = UURIFactory.getInstance(uuri.toString());
325         assertEquals("Not equal (dbl-encoding) utf8", encodedUtf8Uri, uuri.toString());
326     }
327     
328     /**
329      * Test for syntax errors stop page parsing.
330      * @see <a HREF="https://sourceforge.net/tracker/?func=detail&aid=788219&group_id=73833&atid=539099">[ 788219 ] URI Syntax Errors stop page parsing</a>
331      * @throws URIException
332      */

333     public final void testThreeSlashes() throws URIException {
334         UURI goodURI = UURIFactory.
335         getInstance("http://lcweb.loc.gov/rr/goodtwo.html");
336         String JavaDoc uuri = "http:///lcweb.loc.gov/rr/goodtwo.html";
337         UURI rewrittenURI = UURIFactory.getInstance(uuri);
338         assertTrue("Not equal " + goodURI + ", " + uuri,
339                 goodURI.toString().equals(rewrittenURI.toString()));
340         uuri = "http:////lcweb.loc.gov/rr/goodtwo.html";
341         rewrittenURI = UURIFactory.getInstance(uuri);
342         assertTrue("Not equal " + goodURI + ", " + uuri,
343                 goodURI.toString().equals(rewrittenURI.toString()));
344         // Check https.
345
goodURI = UURIFactory.
346         getInstance("https://lcweb.loc.gov/rr/goodtwo.html");
347         uuri = "https:////lcweb.loc.gov/rr/goodtwo.html";
348         rewrittenURI = UURIFactory.getInstance(uuri);
349         assertTrue("Not equal " + goodURI + ", " + uuri,
350                 goodURI.toString().equals(rewrittenURI.toString()));
351     }
352     
353     public final void testNoScheme() {
354         boolean expectedException = false;
355         String JavaDoc uuri = "www.loc.gov/rr/european/egw/polishex.html";
356         try {
357             UURIFactory.getInstance(uuri);
358         } catch (URIException e) {
359             // Expected exception.
360
expectedException = true;
361         }
362         assertTrue("Didn't get expected exception: " + uuri,
363                 expectedException);
364     }
365     
366     public final void testRelative() throws URIException {
367         UURI uuriTgt = UURIFactory.
368         getInstance("http://archive.org:83/home.html");
369         UURI uri = UURIFactory.
370         getInstance("http://archive.org:83/one/two/three.html");
371         UURI uuri = UURIFactory.
372         getInstance(uri, "/home.html");
373         assertTrue("Not equal",
374                 uuriTgt.toString().equals(uuri.toString()));
375     }
376     
377     /**
378      * Test that an empty uuri does the right thing -- that we get back the
379      * base.
380      *
381      * @throws URIException
382      */

383     public final void testRelativeEmpty() throws URIException {
384         UURI uuriTgt = UURIFactory.
385         getInstance("http://archive.org:83/one/two/three.html");
386         UURI uri = UURIFactory.
387         getInstance("http://archive.org:83/one/two/three.html");
388         UURI uuri = UURIFactory.
389         getInstance(uri, "");
390         assertTrue("Empty length don't work",
391                 uuriTgt.toString().equals(uuri.toString()));
392     }
393     
394     public final void testAbsolute() throws URIException {
395         UURI uuriTgt = UURIFactory.
396         getInstance("http://archive.org:83/home.html");
397         UURI uri = UURIFactory.
398         getInstance("http://archive.org:83/one/two/three.html");
399         UURI uuri = UURIFactory.
400         getInstance(uri, "http://archive.org:83/home.html");
401         assertTrue("Not equal",
402                 uuriTgt.toString().equals(uuri.toString()));
403     }
404     
405     /**
406      * Test for [ 962892 ] UURI accepting/creating unUsable URIs (bad hosts).
407      * @see <a HREF="https://sourceforge.net/tracker/?func=detail&atid=539099&aid=962892&group_id=73833">[ 962892 ] UURI accepting/creating unUsable URIs (bad hosts)</a>
408      */

409     public final void testHostWithLessThan() {
410         checkExceptionOnIllegalDomainlabel("http://www.betamobile.com</A");
411         checkExceptionOnIllegalDomainlabel(
412         "http://C|/unzipped/426/spacer.gif");
413         checkExceptionOnIllegalDomainlabel("http://www.lycos.co.uk\"/l/b/\"");
414     }
415     
416     /**
417      * Test for [ 1012520 ] UURI.length() &gt; 2k.
418      * @throws URIException
419      * @see <a HREF="http://sourceforge.net/tracker/index.php?func=detail&aid=1012520&group_id=73833&atid=539099">[ 1012520 ] UURI.length() &gt; 2k</a>
420      */

421     public final void test2kURI() throws URIException {
422         final StringBuffer JavaDoc buffer = new StringBuffer JavaDoc("http://a.b");
423         final String JavaDoc subPath = "/123456789";
424         for (int i = 0; i < 207; i++) {
425             buffer.append(subPath);
426         }
427         // String should be 2080 characters long. Legal.
428
UURIFactory.getInstance(buffer.toString());
429         boolean gotException = false;
430         // Add ten more characters and make size illegal.
431
buffer.append(subPath);
432         try {
433             UURIFactory.getInstance(buffer.toString());
434         } catch (URIException e) {
435             gotException = true;
436         }
437         assertTrue("No expected exception complaining about long URI",
438                 gotException);
439     }
440     
441     private void checkExceptionOnIllegalDomainlabel(String JavaDoc uuri) {
442         boolean expectedException = false;
443         try {
444             UURIFactory.getInstance(uuri);
445         } catch (URIException e) {
446             // Expected exception.
447
expectedException = true;
448         }
449         assertTrue("Didn't get expected exception: " + uuri,
450                 expectedException);
451     }
452     
453     /**
454      * Test for doing separate DNS lookup for same host
455      *
456      * @see <a HREF="https://sourceforge.net/tracker/?func=detail&aid=788277&group_id=73833&atid=539099">[ 788277 ] Doing separate DNS lookup for same host</a>
457      * @throws URIException
458      */

459     public final void testHostWithPeriod() throws URIException {
460         UURI uuri1 = UURIFactory.
461         getInstance("http://www.loc.gov./index.html");
462         UURI uuri2 = UURIFactory.
463         getInstance("http://www.loc.gov/index.html");
464         assertEquals("Failed equating hosts with dot",
465                 uuri1.getHost(), uuri2.getHost());
466     }
467     
468     /**
469      * Test for NPE in java.net.URI.encode
470      *
471      * @see <a HREF="https://sourceforge.net/tracker/?func=detail&aid=874220&group_id=73833&atid=539099">[ 874220 ] NPE in java.net.URI.encode</a>
472      * @throws URIException
473      */

474     public final void testHostEncodedChars() throws URIException {
475         String JavaDoc s = "http://g.msn.co.kr/0nwkokr0/00/19??" +
476         "PS=10274&NC=10009&CE=42&CP=949&HL=" +
477         "&#65533;&#65533;&#65533;?&#65533;&#65533;";
478         assertNotNull("Encoded chars " + s,
479                 UURIFactory.getInstance(s));
480     }
481     
482     /**
483      * Test for java.net.URI parses %20 but getHost null
484      *
485      * See <a HREF="https://sourceforge.net/tracker/?func=detail&aid=927940&group_id=73833&atid=539099">[ 927940 ] java.net.URI parses %20 but getHost null</a>
486      */

487     public final void testSpaceInHost() {
488         boolean expectedException = false;
489         try {
490             UURIFactory.getInstance(
491                     "http://www.local-regions.odpm%20.gov.uk" +
492             "/lpsa/challenge/pdf/propect.pdf");
493         } catch (URIException e) {
494             expectedException = true;
495         }
496         assertTrue("Did not fail with escaped space.", expectedException);
497         
498         expectedException = false;
499         try {
500             UURIFactory.getInstance(
501                     "http://www.local-regions.odpm .gov.uk" +
502             "/lpsa/challenge/pdf/propect.pdf");
503         } catch (URIException e) {
504             expectedException = true;
505         }
506         assertTrue("Did not fail with real space.", expectedException);
507     }
508     
509     /**
510      * Test for java.net.URI chokes on hosts_with_underscores.
511      *
512      * @see <a HREF="https://sourceforge.net/tracker/?func=detail&aid=808270&group_id=73833&atid=539099">[ 808270 ] java.net.URI chokes on hosts_with_underscores</a>
513      * @throws URIException
514      */

515     public final void testHostWithUnderscores() throws URIException {
516         UURI uuri = UURIFactory.getInstance(
517         "http://x_underscore_underscore.2u.com.tw/nonexistent_page.html");
518         assertEquals("Failed get of host with underscore",
519                 "x_underscore_underscore.2u.com.tw", uuri.getHost());
520     }
521     
522     
523     /**
524      * Two dots for igor.
525      */

526     public final void testTwoDots() {
527         boolean expectedException = false;
528         try {
529             UURIFactory.getInstance(
530             "http://x_underscore_underscore..2u.com/nonexistent_page.html");
531         } catch (URIException e) {
532             expectedException = true;
533         }
534         assertTrue("Two dots did not throw exception", expectedException);
535     }
536     
537     /**
538      * Test for java.net.URI#getHost fails when leading digit.
539      *
540      * @see <a HREF="https://sourceforge.net/tracker/?func=detail&aid=910120&group_id=73833&atid=539099">[ 910120 ] java.net.URI#getHost fails when leading digit.</a>
541      * @throws URIException
542      */

543     public final void testHostWithDigit() throws URIException {
544         UURI uuri = UURIFactory.
545         getInstance("http://0204chat.2u.com.tw/nonexistent_page.html");
546         assertEquals("Failed get of host with digit",
547                 "0204chat.2u.com.tw", uuri.getHost());
548     }
549     
550     /**
551      * Test for Constraining java URI class.
552      *
553      * @see <a HREF="https://sourceforge.net/tracker/?func=detail&aid=949548&group_id=73833&atid=539099">[ 949548 ] Constraining java URI class</a>
554      */

555     public final void testPort() {
556         checkBadPort("http://www.tyopaikat.com:a/robots.txt");
557         checkBadPort("http://158.144.21.3:80808/robots.txt");
558         checkBadPort("http://pdb.rutgers.edu:81.rutgers.edu/robots.txt");
559         checkBadPort(
560             "https://webmail.gse.harvard.edu:9100robots.txt/robots.txt");
561         checkBadPort(
562             "https://webmail.gse.harvard.edu:0/robots.txt/robots.txt");
563     }
564     
565     /**
566      * Test bad port throws exception.
567      * @param uri URI with bad port to check.
568      */

569     private void checkBadPort(String JavaDoc uri) {
570         boolean exception = false;
571         try {
572             UURIFactory.getInstance(uri);
573         }
574         catch (URIException e) {
575             exception = true;
576         }
577         assertTrue("Didn't throw exception: " + uri, exception);
578     }
579     
580     /**
581      * Preserve userinfo capitalization.
582      * @throws URIException
583      */

584     public final void testUserinfo() throws URIException {
585         final String JavaDoc authority = "stack:StAcK@www.tyopaikat.com";
586         final String JavaDoc uri = "http://" + authority + "/robots.txt";
587         UURI uuri = UURIFactory.getInstance(uri);
588         assertEquals("Authority not equal", uuri.getAuthority(),
589             authority);
590         /*
591         String tmp = uuri.toString();
592         assertTrue("URI not equal", tmp.equals(uri));
593         */

594     }
595
596     /**
597      * Test user info + port
598      * @throws URIException
599      */

600     public final void testUserinfoPlusPort() throws URIException {
601         final String JavaDoc userInfo = "stack:StAcK";
602         final String JavaDoc authority = "www.tyopaikat.com";
603         final int port = 8080;
604         final String JavaDoc uri = "http://" + userInfo + "@" + authority + ":" + port
605             + "/robots.txt";
606         UURI uuri = UURIFactory.getInstance(uri);
607         assertEquals("Host not equal", authority,uuri.getHost());
608         assertEquals("Userinfo Not equal",userInfo,uuri.getUserinfo());
609         assertEquals("Port not equal",port,uuri.getPort());
610         assertEquals("Authority wrong","stack:StAcK@www.tyopaikat.com:8080",
611                 uuri.getAuthority());
612         assertEquals("AuthorityMinusUserinfo wrong","www.tyopaikat.com:8080",
613                 uuri.getAuthorityMinusUserinfo());
614         
615     }
616     
617     /**
618      * Tests from rfc2396 with amendments to accomodate differences
619      * intentionally added to make our URI handling like IEs.
620      *
621      * <pre>
622      * g:h = g:h
623      * g = http://a/b/c/g
624      * ./g = http://a/b/c/g
625      * g/ = http://a/b/c/g/
626      * /g = http://a/g
627      * //g = http://g
628      * ?y = http://a/b/c/?y
629      * g?y = http://a/b/c/g?y
630      * #s = (current document)#s
631      * g#s = http://a/b/c/g#s
632      * g?y#s = http://a/b/c/g?y#s
633      * ;x = http://a/b/c/;x
634      * g;x = http://a/b/c/g;x
635      * g;x?y#s = http://a/b/c/g;x?y#s
636      * . = http://a/b/c/
637      * ./ = http://a/b/c/
638      * .. = http://a/b/
639      * ../ = http://a/b/
640      * ../g = http://a/b/g
641      * ../.. = http://a/
642      * ../../ = http://a/
643      * ../../g = http://a/g
644      * </pre>
645      *
646      * @throws URIException
647      */

648     public final void testRFC2396Relative() throws URIException {
649         UURI base = UURIFactory.
650         getInstance("http://a/b/c/d;p?q");
651         TreeMap JavaDoc<String JavaDoc,String JavaDoc> m = new TreeMap JavaDoc<String JavaDoc,String JavaDoc>();
652         m.put("..", "http://a/b/");
653         m.put("../", "http://a/b/");
654         m.put("../g", "http://a/b/g");
655         m.put("../..", "http://a/");
656         m.put("../../", "http://a/");
657         m.put("../../g", "http://a/g");
658         m.put("g#s", "http://a/b/c/g#s");
659         m.put("g?y#s ", "http://a/b/c/g?y#s");
660         m.put(";x", "http://a/b/c/;x");
661         m.put("g;x", "http://a/b/c/g;x");
662         m.put("g;x?y#s", "http://a/b/c/g;x?y#s");
663         m.put(".", "http://a/b/c/");
664         m.put("./", "http://a/b/c/");
665         m.put("g", "http://a/b/c/g");
666         m.put("./g", "http://a/b/c/g");
667         m.put("g/", "http://a/b/c/g/");
668         m.put("/g", "http://a/g");
669         m.put("//g", "http://g");
670         m.put("?y", "http://a/b/c/?y");
671         m.put("g?y", "http://a/b/c/g?y");
672         // EXTRAS beyond the RFC set.
673
// TODO: That these resolve to a path of /a/g might be wrong. Perhaps
674
// it should be '/g'?.
675
m.put("/../../../../../../../../g", "http://a/g");
676         m.put("../../../../../../../../g", "http://a/g");
677         m.put("../G", "http://a/b/G");
678         for (Iterator JavaDoc i = m.keySet().iterator(); i.hasNext();) {
679             String JavaDoc key = (String JavaDoc)i.next();
680             String JavaDoc value = (String JavaDoc)m.get(key);
681             UURI uuri = UURIFactory.getInstance(base, key);
682             assertTrue("Unexpected " + key + " " + value + " " + uuri,
683                     uuri.equals(UURIFactory.getInstance(value)));
684         }
685     }
686     
687     /**
688      * A UURI should always be without a 'fragment' segment, which is
689      * unused and irrelevant for network fetches.
690      *
691      * See [ 970666 ] #anchor links not trimmed, and thus recrawled
692      *
693      * @throws URIException
694      */

695     public final void testAnchors() throws URIException {
696         UURI uuri = UURIFactory.
697         getInstance("http://www.example.com/path?query#anchor");
698         assertEquals("Not equal", "http://www.example.com/path?query",
699                 uuri.toString());
700     }
701     
702
703     /**
704      * Ensure that URI strings beginning with a colon are treated
705      * the same as browsers do (as relative, rather than as absolute
706      * with zero-length scheme).
707      *
708      * @throws URIException
709      */

710     public void testStartsWithColon() throws URIException {
711         UURI base = UURIFactory.getInstance("http://www.example.com/path/page");
712         UURI uuri = UURIFactory.getInstance(base,":foo");
713         assertEquals("derelativize starsWithColon",
714                 uuri.getURI(),
715                 "http://www.example.com/path/:foo");
716     }
717     
718     /**
719      * Ensure that stray trailing '%' characters do not prevent
720      * UURI instances from being created, and are reasonably
721      * escaped when encountered.
722      *
723      * @throws URIException
724      */

725     public void testTrailingPercents() throws URIException {
726         String JavaDoc plainPath = "http://www.example.com/path%";
727         UURI plainPathUuri = UURIFactory.getInstance(plainPath);
728         assertEquals("plainPath getURI", plainPath, plainPathUuri.getURI());
729         assertEquals("plainPath getEscapedURI",
730                 "http://www.example.com/path%", // browsers don't escape '%'
731
plainPathUuri.getEscapedURI());
732         
733         String JavaDoc partiallyEscapedPath = "http://www.example.com/pa%20th%";
734         UURI partiallyEscapedPathUuri = UURIFactory.getInstance(
735                 partiallyEscapedPath);
736 // assertEquals("partiallyEscapedPath getURI",
737
// "http://www.example.com/pa th%", // TODO: is this desirable?
738
//// partiallyEscapedPath,
739
// partiallyEscapedPathUuri.getURI());
740
assertEquals("partiallyEscapedPath getEscapedURI",
741                 "http://www.example.com/pa%20th%",
742                 partiallyEscapedPathUuri.getEscapedURI());
743         
744         String JavaDoc plainQueryString = "http://www.example.com/path?q=foo%";
745         UURI plainQueryStringUuri = UURIFactory.getInstance(
746                 plainQueryString);
747 // assertEquals("plainQueryString getURI",
748
// plainQueryString,
749
// plainQueryStringUuri.getURI());
750
assertEquals("plainQueryString getEscapedURI",
751                 "http://www.example.com/path?q=foo%",
752                 plainQueryStringUuri.getEscapedURI());
753         
754         String JavaDoc partiallyEscapedQueryString =
755             "http://www.example.com/pa%20th?q=foo%";
756         UURI partiallyEscapedQueryStringUuri = UURIFactory.getInstance(
757                 partiallyEscapedQueryString);
758         assertEquals("partiallyEscapedQueryString getURI",
759                 "http://www.example.com/pa th?q=foo%",
760                 partiallyEscapedQueryStringUuri.getURI());
761         assertEquals("partiallyEscapedQueryString getEscapedURI",
762                 "http://www.example.com/pa%20th?q=foo%",
763                 partiallyEscapedQueryStringUuri.getEscapedURI());
764     }
765     
766     /**
767      * Ensure that stray '%' characters do not prevent
768      * UURI instances from being created, and are reasonably
769      * escaped when encountered.
770      *
771      * @throws URIException
772      */

773     public void testStrayPercents() throws URIException {
774         String JavaDoc oneStray = "http://www.example.com/pa%th";
775         UURI oneStrayUuri = UURIFactory.getInstance(oneStray);
776         assertEquals("oneStray getURI", oneStray, oneStrayUuri.getURI());
777         assertEquals("oneStray getEscapedURI",
778                 "http://www.example.com/pa%th", // browsers don't escape '%'
779
oneStrayUuri.getEscapedURI());
780         
781         String JavaDoc precededByValidEscape = "http://www.example.com/pa%20th%way";
782         UURI precededByValidEscapeUuri = UURIFactory.getInstance(
783                 precededByValidEscape);
784         assertEquals("precededByValidEscape getURI",
785                 "http://www.example.com/pa th%way", // getURI interprets escapes
786
precededByValidEscapeUuri.getURI());
787         assertEquals("precededByValidEscape getEscapedURI",
788                 "http://www.example.com/pa%20th%way",
789                 precededByValidEscapeUuri.getEscapedURI());
790         
791         String JavaDoc followedByValidEscape = "http://www.example.com/pa%th%20way";
792         UURI followedByValidEscapeUuri = UURIFactory.getInstance(
793                 followedByValidEscape);
794         assertEquals("followedByValidEscape getURI",
795                 "http://www.example.com/pa%th way", // getURI interprets escapes
796
followedByValidEscapeUuri.getURI());
797         assertEquals("followedByValidEscape getEscapedURI",
798                 "http://www.example.com/pa%th%20way",
799                 followedByValidEscapeUuri.getEscapedURI());
800     }
801     
802     public void testEscapingNotNecessary() throws URIException {
803         String JavaDoc escapesUnnecessary =
804             "http://www.example.com/misc;reserved:chars@that&don't=need"
805             +"+escaping$even,though!you(might)initially?think#so";
806         // expect everything but the #fragment
807
String JavaDoc expected = escapesUnnecessary.substring(0, escapesUnnecessary
808                 .length() - 3);
809         assertEquals("escapes unnecessary",
810                 expected,
811                 UURIFactory.getInstance(escapesUnnecessary).toString());
812     }
813     
814     public void testIdn() throws URIException {
815         // See http://www.josefsson.org/idn.php.
816
String JavaDoc idn1 = new String JavaDoc("http://räksmörgås.josefßon.org/");
817         String JavaDoc puny1 = "http://xn--rksmrgs-5wao1o.josefsson.org/";
818         assertEquals("encoding of " + idn1, puny1, UURIFactory
819                 .getInstance(idn1).toString());
820         String JavaDoc idn2 = "http://www.pølse.dk/";
821         String JavaDoc puny2 = "http://www.xn--plse-gra.dk/";
822         assertEquals("encoding of " + idn2, puny2, UURIFactory
823                 .getInstance(idn2).toString());
824     }
825     
826     public void testNewLineInURL() throws URIException {
827         UURI uuri = UURIFactory.getInstance("http://www.ar\rchive\n." +
828             "org/i\n\n\r\rndex.html");
829         assertEquals("http://www.archive.org/index.html", uuri.toString());
830     }
831     
832     public void testQueryEscaping() throws URIException {
833         UURI uuri = UURIFactory.getInstance(
834             "http://www.yahoo.com/foo?somechars!@$%^&*()_-+={[}]|\'\";:/?.>,<");
835         assertEquals(
836             // tests in FF1.5 indicate it only escapes " < >
837
"http://www.yahoo.com/foo?somechars!@$%^&*()_-+={[}]|\'%22;:/?.%3E,%3C",
838             uuri.toString());
839     }
840     
    /**
     * Check that our 'normalization' does the same as Nutch's.
     * The before-and-after pairs below were taken from the Nutch
     * urlnormalizer-basic TestBasicURLNormalizer class (December 2006,
     * Nutch 0.9-dev).
     * @throws URIException
     */

847     public void testSameAsNutchURLFilterBasic() throws URIException {
848         assertEquals(UURIFactory.getInstance(" http://foo.com/ ").toString(),
849             "http://foo.com/");
850
851         // check that protocol is lower cased
852
assertEquals(UURIFactory.getInstance("HTTP://foo.com/").toString(),
853             "http://foo.com/");
854         
855         // check that host is lower cased
856
assertEquals(UURIFactory.
857                 getInstance("http://Foo.Com/index.html").toString(),
858             "http://foo.com/index.html");
859         assertEquals(UURIFactory.
860                 getInstance("http://Foo.Com/index.html").toString(),
861             "http://foo.com/index.html");
862
863         // check that port number is normalized
864
assertEquals(UURIFactory.
865                 getInstance("http://foo.com:80/index.html").toString(),
866             "http://foo.com/index.html");
867         assertEquals(UURIFactory.getInstance("http://foo.com:81/").toString(),
868             "http://foo.com:81/");
869
870         // check that null path is normalized
871
assertEquals(UURIFactory.getInstance("http://foo.com").toString(),
872             "http://foo.com/");
873
874         // check that references are removed
875
assertEquals(UURIFactory.
876                 getInstance("http://foo.com/foo.html#ref").toString(),
877             "http://foo.com/foo.html");
878
879         // // check that encoding is normalized
880
// normalizeTest("http://foo.com/%66oo.html", "http://foo.com/foo.html");
881

882         // check that unnecessary "../" are removed
883
assertEquals(UURIFactory.
884                 getInstance("http://foo.com/aa/../").toString(),
885             "http://foo.com/" );
886         assertEquals(UURIFactory.
887                 getInstance("http://foo.com/aa/bb/../").toString(),
888             "http://foo.com/aa/");
889
890         /* We fail this one. Here we produce: 'http://foo.com/'.
891         assertEquals(UURIFactory.
892                 getInstance("http://foo.com/aa/..").toString(),
893             "http://foo.com/aa/..");
894          */

895         
896         assertEquals(UURIFactory.
897             getInstance("http://foo.com/aa/bb/cc/../../foo.html").toString(),
898                 "http://foo.com/aa/foo.html");
899         assertEquals(UURIFactory.
900             getInstance("http://foo.com/aa/bb/../cc/dd/../ee/foo.html").
901                 toString(),
902                     "http://foo.com/aa/cc/ee/foo.html");
903         assertEquals(UURIFactory.
904             getInstance("http://foo.com/../foo.html").toString(),
905                 "http://foo.com/foo.html" );
906         assertEquals(UURIFactory.
907             getInstance("http://foo.com/../../foo.html").toString(),
908                 "http://foo.com/foo.html" );
909         assertEquals(UURIFactory.
910             getInstance("http://foo.com/../aa/../foo.html").toString(),
911                 "http://foo.com/foo.html" );
912         assertEquals(UURIFactory.
913             getInstance("http://foo.com/aa/../../foo.html").toString(),
914                 "http://foo.com/foo.html" );
915         assertEquals(UURIFactory.
916                 getInstance("http://foo.com/aa/../bb/../foo.html/../../").
917                     toString(),
918             "http://foo.com/" );
919         assertEquals(UURIFactory.getInstance("http://foo.com/../aa/foo.html").
920             toString(), "http://foo.com/aa/foo.html" );
921         assertEquals(UURIFactory.
922                 getInstance("http://foo.com/../aa/../foo.html").toString(),
923             "http://foo.com/foo.html" );
924         assertEquals(UURIFactory.
925                 getInstance("http://foo.com/a..a/foo.html").toString(),
926             "http://foo.com/a..a/foo.html" );
927         assertEquals(UURIFactory.
928                 getInstance("http://foo.com/a..a/../foo.html").toString(),
929             "http://foo.com/foo.html" );
930         assertEquals(UURIFactory.
931             getInstance("http://foo.com/foo.foo/../foo.html").toString(),
932                  "http://foo.com/foo.html" );
933     }
934     
935     public void testHttpSchemeColonSlash() {
936         boolean exception = false;
937         try {
938             UURIFactory.getInstance("https:/");
939         } catch (URIException e) {
940             exception = true;
941         }
942         assertTrue("Didn't throw exception when one expected", exception);
943         exception = false;
944         try {
945             UURIFactory.getInstance("http://");
946         } catch (URIException e) {
947             exception = true;
948         }
949         assertTrue("Didn't throw exception when one expected", exception);
950     }
951 }
952
Popular Tags