KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > archive > net > UURIFactory


1 /* UURIFactory
2  *
3  * $Id: UURIFactory.java,v 1.12.4.1 2007/01/13 01:31:38 stack-sf Exp $
4  *
5  * Created on July 16, 2004
6  *
7  * Copyright (C) 2003 Internet Archive.
8  *
9  * This file is part of the Heritrix web crawler (crawler.archive.org).
10  *
11  * Heritrix is free software; you can redistribute it and/or modify
12  * it under the terms of the GNU Lesser Public License as published by
13  * the Free Software Foundation; either version 2.1 of the License, or
14  * any later version.
15  *
16  * Heritrix is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19  * GNU Lesser Public License for more details.
20  *
21  * You should have received a copy of the GNU Lesser Public License
22  * along with Heritrix; if not, write to the Free Software
23  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
24  */

25 package org.archive.net;
26
27 import gnu.inet.encoding.IDNA;
28 import gnu.inet.encoding.IDNAException;
29 import it.unimi.dsi.mg4j.util.MutableString;
30
31 import java.io.UnsupportedEncodingException JavaDoc;
32 import java.util.Arrays JavaDoc;
33 import java.util.BitSet JavaDoc;
34 import java.util.logging.Level JavaDoc;
35 import java.util.logging.Logger JavaDoc;
36 import java.util.regex.Matcher JavaDoc;
37 import java.util.regex.Pattern JavaDoc;
38
39 import org.apache.commons.httpclient.URI;
40 import org.apache.commons.httpclient.URIException;
41 import org.archive.util.TextUtils;
42
43
44 /**
45  * Factory that returns UURIs.
46  *
47  * Does escaping and fixup on URIs massaging in accordance with RFC2396
48  * and to match browser practice. For example, it removes any
49  * '..' if first thing in the path as per IE, converts backslashes to forward
50  * slashes, and discards any 'fragment'/anchor portion of the URI. This
51  * class will also fail URIs if they are longer than IE's allowed maximum
52  * length.
53  *
54  * <p>TODO: Test logging.
55  *
56  * @author stack
57  */

58 public class UURIFactory extends URI {
59     
60     private static final long serialVersionUID = -6146295130382209042L;
61
62     /**
63      * Logging instance.
64      */

65     private static Logger JavaDoc logger =
66         Logger.getLogger(UURIFactory.class.getName());
67     
68     /**
69      * The single instance of this factory.
70      */

71     private static final UURIFactory factory = new UURIFactory();
72     
73     /**
74      * RFC 2396-inspired regex.
75      *
76      * From the RFC Appendix B:
77      * <pre>
78      * URI Generic Syntax August 1998
79      *
80      * B. Parsing a URI Reference with a Regular Expression
81      *
82      * As described in Section 4.3, the generic URI syntax is not sufficient
83      * to disambiguate the components of some forms of URI. Since the
84      * "greedy algorithm" described in that section is identical to the
85      * disambiguation method used by POSIX regular expressions, it is
86      * natural and commonplace to use a regular expression for parsing the
87      * potential four components and fragment identifier of a URI reference.
88      *
89      * The following line is the regular expression for breaking-down a URI
90      * reference into its components.
91      *
92      * ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
93      * 12 3 4 5 6 7 8 9
94      *
95      * The numbers in the second line above are only to assist readability;
96      * they indicate the reference points for each subexpression (i.e., each
97      * paired parenthesis). We refer to the value matched for subexpression
98      * <n> as $<n>. For example, matching the above expression to
99      *
100      * http://www.ics.uci.edu/pub/ietf/uri/#Related
101      *
102      * results in the following subexpression matches:
103      *
104      * $1 = http:
105      * $2 = http
106      * $3 = //www.ics.uci.edu
107      * $4 = www.ics.uci.edu
108      * $5 = /pub/ietf/uri/
109      * $6 = <undefined>
110      * $7 = <undefined>
111      * $8 = #Related
112      * $9 = Related
113      *
114      * where <undefined> indicates that the component is not present, as is
115      * the case for the query component in the above example. Therefore, we
116      * can determine the value of the four components and fragment as
117      *
118      * scheme = $2
119      * authority = $4
120      * path = $5
121      * query = $7
122      * fragment = $9
123      * </pre>
124      *
125      * --
126      * <p>Below differs from the rfc regex in that it has java escaping of
127      * regex characters and we allow a URI made of a fragment only (Added extra
128      * group so indexing is off by one after scheme).
129      */

130     final static Pattern JavaDoc RFC2396REGEX = Pattern.compile(
131         "^(([^:/?#]+):)?((//([^/?#]*))?([^?#]*)(\\?([^#]*))?)?(#(.*))?");
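// Capturing groups of the pattern above. An extra group (3) wraps
// everything between the scheme and the fragment, so indices after
// the scheme are one higher than in the RFC's own regex:
//
//    1: scheme:        (scheme including the trailing colon)
//    2: scheme
//    3: //authority/path?query   (the scheme-specific part)
//    4: //authority
//    5: authority
//    6: path
//    7: ?query
//    8: query
//    9: #fragment
//   10: fragment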

145     public static final String JavaDoc SLASHDOTDOTSLASH = "^(/\\.\\./)+";
146     public static final String JavaDoc SLASH = "/";
147     public static final String JavaDoc HTTP = "http";
148     public static final String JavaDoc HTTP_PORT = ":80";
149     public static final String JavaDoc HTTPS = "https";
150     public static final String JavaDoc HTTPS_PORT = ":443";
151     public static final String JavaDoc DOT = ".";
152     public static final String JavaDoc EMPTY_STRING = "";
153     public static final String JavaDoc NBSP = "\u00A0";
154     public static final String JavaDoc SPACE = " ";
155     public static final String JavaDoc ESCAPED_SPACE = "%20";
156     public static final String JavaDoc TRAILING_ESCAPED_SPACE = "^(.*)(%20)+$";
157     public static final String JavaDoc PIPE = "|";
158     public static final String JavaDoc PIPE_PATTERN = "\\|";
159     public static final String JavaDoc ESCAPED_PIPE = "%7C";
160     public static final String JavaDoc CIRCUMFLEX = "^";
161     public static final String JavaDoc CIRCUMFLEX_PATTERN = "\\^";
162     public static final String JavaDoc ESCAPED_CIRCUMFLEX = "%5E";
163     public static final String JavaDoc QUOT = "\"";
164     public static final String JavaDoc ESCAPED_QUOT = "%22";
165     public static final String JavaDoc SQUOT = "'";
166     public static final String JavaDoc ESCAPED_SQUOT = "%27";
167     public static final String JavaDoc APOSTROPH = "`";
168     public static final String JavaDoc ESCAPED_APOSTROPH = "%60";
169     public static final String JavaDoc LSQRBRACKET = "[";
170     public static final String JavaDoc LSQRBRACKET_PATTERN = "\\[";
171     public static final String JavaDoc ESCAPED_LSQRBRACKET = "%5B";
172     public static final String JavaDoc RSQRBRACKET = "]";
173     public static final String JavaDoc RSQRBRACKET_PATTERN = "\\]";
174     public static final String JavaDoc ESCAPED_RSQRBRACKET = "%5D";
175     public static final String JavaDoc LCURBRACKET = "{";
176     public static final String JavaDoc LCURBRACKET_PATTERN = "\\{";
177     public static final String JavaDoc ESCAPED_LCURBRACKET = "%7B";
178     public static final String JavaDoc RCURBRACKET = "}";
179     public static final String JavaDoc RCURBRACKET_PATTERN = "\\}";
180     public static final String JavaDoc ESCAPED_RCURBRACKET = "%7D";
181     public static final String JavaDoc BACKSLASH = "\\";
182     public static final String JavaDoc BACKSLASH_PATTERN = "\\\\";
183     public static final String JavaDoc ESCAPED_BACKSLASH = "%5C";
184     public static final String JavaDoc NEWLINE = "\n+|\r+";
185     public static final String JavaDoc IMPROPERESC_REPLACE = "%25$1";
186     public static final String JavaDoc IMPROPERESC =
187         "%((?:[^\\p{XDigit}])|(?:.[^\\p{XDigit}])|(?:\\z))";
188     public static final String JavaDoc COMMERCIAL_AT = "@";
189     public static final char PERCENT_SIGN = '%';
190     public static final char COLON = ':';
191     
192     /**
193      * First percent sign in string followed by two hex chars.
194      */

195     public static final String JavaDoc URI_HEX_ENCODING =
196         "^[^%]*%[\\p{XDigit}][\\p{XDigit}].*";
197     
198     /**
199      * Authority port number regex.
200      */

201     final static Pattern JavaDoc PORTREGEX = Pattern.compile("(.*:)([0-9]+)$");
202     
203     /**
204      * Characters we'll accept in the domain label part of a URI
205      * authority: ASCII letters-digits-hyphen (LDH) plus underscore,
206      * with single intervening '.' characters.
207      *
208      * (We accept '_' because DNS servers have tolerated for many
209      * years counter to spec; we also accept dash patterns and ACE
210      * prefixes that will be rejected by IDN-punycoding attempt.)
211      */

212     final static String JavaDoc ACCEPTABLE_ASCII_DOMAIN =
213         "^(?:[a-zA-Z0-9_-]++(?:\\.)?)++$";
214     
215     /**
216      * Pattern that looks for case of three or more slashes after the
217      * scheme. If found, we replace them with two only as mozilla does.
218      */

219     final static Pattern JavaDoc HTTP_SCHEME_SLASHES =
220         Pattern.compile("^(https?://)/+(.*)");
221     
222     /**
223      * Pattern that looks for case of two or more slashes in a path.
224      */

225     final static Pattern JavaDoc MULTIPLE_SLASHES = Pattern.compile("//+");
226     
227     /**
228      * System property key for list of supported schemes.
229      */

230     private static final String JavaDoc SCHEMES_KEY = ".schemes";
231     
232     /**
233      * System property key for list of purposefully-ignored schemes.
234      */

235     private static final String JavaDoc IGNORED_SCHEMES_KEY = ".ignored-schemes";
236
237     private String JavaDoc[] schemes = null;
238     private String JavaDoc[] ignoredSchemes = null;
239
240     public static final int IGNORED_SCHEME = 9999999;
241     
242     /**
243      * Protected constructor.
244      */

245     private UURIFactory() {
246         super();
247         String JavaDoc s = System.getProperty(this.getClass().getName() + SCHEMES_KEY);
248         if (s != null && s.length() > 0) {
249             schemes = s.split("[, ]+");
250             Arrays.sort(schemes);
251         }
252         String JavaDoc ignored = System.getProperty(this.getClass().getName() + IGNORED_SCHEMES_KEY);
253         if (ignored != null && ignored.length() > 0) {
254             ignoredSchemes = ignored.split("[, ]+");
255             Arrays.sort(ignoredSchemes);
256         }
257     }
258     
259     /**
260      * @param uri URI as string.
261      * @return An instance of UURI
262      * @throws URIException
263      */

264     public static UURI getInstance(String JavaDoc uri) throws URIException {
265         return UURIFactory.factory.create(uri);
266     }
267     
268     /**
269      * @param uri URI as string.
270      * @param charset Character encoding of the passed uri string.
271      * @return An instance of UURI
272      * @throws URIException
273      */

274     public static UURI getInstance(String JavaDoc uri, String JavaDoc charset)
275             throws URIException {
276         return UURIFactory.factory.create(uri, charset);
277     }
278     
279     /**
280      * @param base Base uri to use resolving passed relative uri.
281      * @param relative URI as string.
282      * @return An instance of UURI
283      * @throws URIException
284      */

285     public static UURI getInstance(UURI base, String JavaDoc relative)
286             throws URIException {
287         return UURIFactory.factory.create(base, relative);
288     }
289     
290     /**
291      * Test of whether passed String has an allowed URI scheme.
292      * First tests if likely scheme suffix. If so, we then test if its one of
293      * the supported schemes.
294      * @param possibleUrl URL string to examine.
295      * @return True if passed string looks like it could be an URL.
296      */

297     public static boolean hasSupportedScheme(String JavaDoc possibleUrl) {
298         boolean hasScheme = UURI.hasScheme(possibleUrl);
299         if (!hasScheme || UURIFactory.factory.schemes == null) {
300             return hasScheme;
301         }
302         String JavaDoc tmpStr = possibleUrl.substring(0, possibleUrl.indexOf(':'));
303         return Arrays.binarySearch(UURIFactory.factory.schemes, tmpStr) >= 0;
304     }
305
306     /**
307      * @param uri URI as string.
308      * @return Instance of UURI.
309      * @throws URIException
310      */

311     private UURI create(String JavaDoc uri) throws URIException {
312         return create(uri, UURI.getDefaultProtocolCharset());
313     }
314     
315     /**
316      * @param uri URI as string.
317      * @param charset Original encoding of the string.
318      * @return Instance of UURI.
319      * @throws URIException
320      */

321     private UURI create(String JavaDoc uri, String JavaDoc charset) throws URIException {
322         UURI uuri = new UURI(fixup(uri, null, charset), true, charset);
323         if (logger.isLoggable(Level.FINE)) {
324             logger.fine("URI " + uri +
325                 " PRODUCT " + uuri.toString() +
326                 " CHARSET " + charset);
327         }
328         return validityCheck(uuri);
329     }
330     
331     /**
332      * @param base UURI to use as a base resolving <code>relative</code>.
333      * @param relative Relative URI.
334      * @return Instance of UURI.
335      * @throws URIException
336      */

337     private UURI create(UURI base, String JavaDoc relative) throws URIException {
338         UURI uuri = new UURI(base, new UURI(fixup(relative, base, base.getProtocolCharset()),
339             true, base.getProtocolCharset()));
340         if (logger.isLoggable(Level.FINE)) {
341             logger.fine(" URI " + relative +
342                 " PRODUCT " + uuri.toString() +
343                 " CHARSET " + base.getProtocolCharset() +
344                 " BASE " + base);
345         }
346         return validityCheck(uuri);
347     }
348
349     /**
350      * Check the generated UURI.
351      *
352      * At the least look at length of uuri string. We were seeing case
353      * where before escaping, string was &lt; MAX_URL_LENGTH but after was
354      * &gt;. Letting out a too-big message was causing us troubles later
355      * down the processing chain.
356      * @param uuri Created uuri to check.
357      * @return The passed <code>uuri</code> so can easily inline this check.
358      * @throws URIException
359      */

360     protected UURI validityCheck(UURI uuri) throws URIException {
361         if (uuri.getRawURI().length > UURI.MAX_URL_LENGTH) {
362            throw new URIException("Created (escaped) uuri > " +
363               UURI.MAX_URL_LENGTH +": "+uuri.toString());
364         }
365         return uuri;
366     }
367     
368     /**
369      * Do heritrix fix-up on passed uri string.
370      *
371      * Does heritrix escaping; usually escaping done to make our behavior align
372      * with IEs. This method codifies our experience pulling URIs from the
373      * wilds. Its does all the escaping we want; its output can always be
374      * assumed to be 'escaped' (though perhaps to a laxer standard than the
375      * vanilla HttpClient URI class or official specs might suggest).
376      *
377      * @param uri URI as string.
378      * @param base May be null.
379      * @param e True if the uri is already escaped.
380      * @return A fixed up URI string.
381      * @throws URIException
382      */

383     private String JavaDoc fixup(String JavaDoc uri, final URI base, final String JavaDoc charset)
384     throws URIException {
385         if (uri == null) {
386             throw new NullPointerException JavaDoc();
387         } else if (uri.length() == 0 && base == null) {
388             throw new URIException("URI length is zero (and not relative).");
389         }
390         
391         if (uri.length() > UURI.MAX_URL_LENGTH) {
392             // We check length here and again later after all convertions.
393
throw new URIException("URI length > " + UURI.MAX_URL_LENGTH +
394                 ": " + uri);
395         }
396         
397         // Replace nbsp with normal spaces (so that they get stripped if at
398
// ends, or encoded if in middle)
399
if (uri.indexOf(NBSP) >= 0) {
400             uri = TextUtils.replaceAll(NBSP, uri, SPACE);
401         }
402         
403         // Get rid of any trailing spaces or new-lines.
404
uri = uri.trim();
405         
406         // IE actually converts backslashes to slashes rather than to %5C.
407
// Since URIs that have backslashes usually work only with IE, we will
408
// convert backslashes to slashes as well.
409
// TODO: Maybe we can first convert backslashes by specs and than by IE
410
// so that we fetch both versions.
411
if (uri.indexOf(BACKSLASH) >= 0) {
412             uri = TextUtils.replaceAll(BACKSLASH_PATTERN, uri, SLASH);
413         }
414         
415         // Kill newlines etc
416
uri = TextUtils.replaceAll(NEWLINE, uri, EMPTY_STRING);
417         
418         // Test for the case of more than two slashes after the http(s) scheme.
419
// Replace with two slashes as mozilla does if found.
420
// See [ 788219 ] URI Syntax Errors stop page parsing.
421
Matcher JavaDoc matcher = HTTP_SCHEME_SLASHES.matcher(uri);
422         if (matcher.matches()) {
423             uri = matcher.group(1) + matcher.group(2);
424         }
425
426         // now, minimally escape any whitespace
427
uri = escapeWhitespace(uri);
428         
429         // For further processing, get uri elements. See the RFC2396REGEX
430
// comment above for explaination of group indices used in the below.
431
matcher = RFC2396REGEX.matcher(uri);
432         if (!matcher.matches()) {
433             throw new URIException("Failed parse of " + uri);
434         }
435         String JavaDoc uriScheme = checkUriElementAndLowerCase(matcher.group(2));
436         String JavaDoc uriSchemeSpecificPart = checkUriElement(matcher.group(3));
437         String JavaDoc uriAuthority = checkUriElement(matcher.group(5));
438         String JavaDoc uriPath = checkUriElement(matcher.group(6));
439         String JavaDoc uriQuery = checkUriElement(matcher.group(8));
440         // UNUSED String uriFragment = checkUriElement(matcher.group(10));
441

442         // If a scheme, is it a supported scheme?
443
if (uriScheme != null && uriScheme.length() > 0 &&
444                 this.schemes != null) {
445             if (!(Arrays.binarySearch(schemes,uriScheme)>=0)) {
446                 // unsupported; see if silently ignored
447
if((Arrays.binarySearch(ignoredSchemes,uriScheme)>=0)) {
448                     throw new URIException(
449                             IGNORED_SCHEME, "Ignored scheme: " + uriScheme);
450                 } else {
451                     throw new URIException("Unsupported scheme: " + uriScheme);
452                 }
453             }
454         }
455         
456         // Test if relative URI. If so, need a base to resolve against.
457
if (uriScheme == null || uriScheme.length() <= 0) {
458             if (base == null) {
459                 throw new URIException("Relative URI but no base: " + uri);
460             }
461         } else {
462             checkHttpSchemeSpecificPartSlashPrefix(base, uriScheme,
463                 uriSchemeSpecificPart);
464         }
465         
466         // fixup authority portion: lowercase/IDN-punycode any domain;
467
// remove stray trailing spaces
468
uriAuthority = fixupAuthority(uriAuthority);
469
470         // Do some checks if absolute path.
471
if (uriSchemeSpecificPart != null &&
472                 uriSchemeSpecificPart.startsWith(SLASH)) {
473             if (uriPath != null) {
474                 // Eliminate '..' if its first thing in the path. IE does this.
475
uriPath = TextUtils.replaceFirst(SLASHDOTDOTSLASH, uriPath,
476                     SLASH);
477             }
478             // Ensure root URLs end with '/': browsers always send "/"
479
// on the request-line, so we should consider "http://host"
480
// to be "http://host/".
481
if (uriPath == null || EMPTY_STRING.equals(uriPath)) {
482                 uriPath = SLASH;
483             }
484         }
485
486         if (uriAuthority != null) {
487             if (uriScheme != null && uriScheme.length() > 0 &&
488                     uriScheme.equals(HTTP)) {
489                 uriAuthority = checkPort(uriAuthority);
490                 uriAuthority = stripTail(uriAuthority, HTTP_PORT);
491             } else if (uriScheme != null && uriScheme.length() > 0 &&
492                     uriScheme.equals(HTTPS)) {
493                 uriAuthority = checkPort(uriAuthority);
494                 uriAuthority = stripTail(uriAuthority, HTTPS_PORT);
495             }
496             // Strip any prefix dot or tail dots from the authority.
497
uriAuthority = stripTail(uriAuthority, DOT);
498             uriAuthority = stripPrefix(uriAuthority, DOT);
499         } else {
500             // no authority; may be relative. consider stripping scheme
501
// to work-around org.apache.commons.httpclient.URI bug
502
// ( http://issues.apache.org/jira/browse/HTTPCLIENT-587 )
503
if (uriScheme != null && base != null
504                     && uriScheme.equals(base.getScheme())) {
505                 // uriScheme redundant and will only confound httpclient.URI
506
uriScheme = null;
507             }
508         }
509         
510         // Ensure minimal escaping. Use of 'lax' URI and URLCodec
511
// means minimal escaping isn't necessarily complete/consistent.
512
// There is a chance such lax encoding will throw exceptions
513
// later at inconvenient times.
514
//
515
// One reason for these bad escapings -- though not the only --
516
// is that the page is using an encoding other than the ASCII or the
517
// UTF-8 that is our default URI encoding. In this case the parent
518
// class is burping on the passed URL encoding. If the page encoding
519
// was passed into this factory, the encoding seems to be parsed
520
// correctly (See the testEscapedEncoding unit test).
521
//
522
// This fixup may cause us to miss content. There is the charset case
523
// noted above. TODO: Look out for cases where we fail other than for
524
// the above given reason which will be fixed when we address
525
// '[ 913687 ] Make extractors interrogate for charset'.
526

527         uriPath = ensureMinimalEscaping(uriPath, charset);
528         uriQuery = ensureMinimalEscaping(uriQuery, charset,
529             LaxURLCodec.QUERY_SAFE);
530
531         // Preallocate. The '1's and '2's in below are space for ':',
532
// '//', etc. URI characters.
533
MutableString s = new MutableString(
534             ((uriScheme != null)? uriScheme.length(): 0)
535             + 1 // ';'
536
+ ((uriAuthority != null)? uriAuthority.length(): 0)
537             + 2 // '//'
538
+ ((uriPath != null)? uriPath.length(): 0)
539             + 1 // '?'
540
+ ((uriQuery != null)? uriQuery.length(): 0));
541         appendNonNull(s, uriScheme, ":", true);
542         appendNonNull(s, uriAuthority, "//", false);
543         appendNonNull(s, uriPath, "", false);
544         appendNonNull(s, uriQuery, "?", false);
545         return s.toString();
546     }
547     
548     /**
549      * If http(s) scheme, check scheme specific part begins '//'.
550      * @throws URIException
551      * @see http://www.faqs.org/rfcs/rfc1738.html Section 3.1. Common Internet
552      * Scheme Syntax
553      */

554     protected void checkHttpSchemeSpecificPartSlashPrefix(final URI base,
555             final String JavaDoc scheme, final String JavaDoc schemeSpecificPart)
556     throws URIException {
557         // Only apply this check if no base.
558
if (base != null) {
559             return;
560         }
561         if (scheme == null || scheme.length() <= 0) {
562             return;
563         }
564         if (!scheme.equals("http") && !scheme.equals("https")) {
565             return;
566         }
567         if (!schemeSpecificPart.startsWith("//")) {
568             throw new URIException("http scheme specific part must " +
569                 "begin '//': " + schemeSpecificPart);
570         }
571         if (schemeSpecificPart.length() <= 2) {
572             throw new URIException("http scheme specific part is " +
573                 "too short: " + schemeSpecificPart);
574         }
575     }
576     
577     /**
578      * Fixup 'authority' portion of URI, by removing any stray
579      * encoded spaces, lowercasing any domain names, and applying
580      * IDN-punycoding to Unicode domains.
581      *
582      * @param uriAuthority the authority string to fix
583      * @return fixed version
584      * @throws URIException
585      */

586     private String JavaDoc fixupAuthority(String JavaDoc uriAuthority) throws URIException {
587         // Lowercase the host part of the uriAuthority; don't destroy any
588
// userinfo capitalizations. Make sure no illegal characters in
589
// domainlabel substring of the uri authority.
590
if (uriAuthority != null) {
591             // Get rid of any trailing escaped spaces:
592
// http://www.archive.org%20. Rare but happens.
593
// TODO: reevaluate: do IE or firefox do such mid-URI space-removal?
594
// if not, we shouldn't either.
595
while(uriAuthority.endsWith(ESCAPED_SPACE)) {
596                 uriAuthority = uriAuthority.substring(0,uriAuthority.length()-3);
597             }
598
599             // lowercase & IDN-punycode only the domain portion
600
int atIndex = uriAuthority.indexOf(COMMERCIAL_AT);
601             int portColonIndex = uriAuthority.indexOf(COLON,(atIndex<0)?0:atIndex);
602             if(atIndex<0 && portColonIndex<0) {
603                 // most common case: neither userinfo nor port
604
return fixupDomainlabel(uriAuthority);
605             } else if (atIndex<0 && portColonIndex>-1) {
606                 // next most common: port but no userinfo
607
String JavaDoc domain = fixupDomainlabel(uriAuthority.substring(0,portColonIndex));
608                 String JavaDoc port = uriAuthority.substring(portColonIndex);
609                 return domain + port;
610             } else if (atIndex>-1 && portColonIndex<0) {
611                 // uncommon: userinfo, no port
612
String JavaDoc userinfo = uriAuthority.substring(0,atIndex+1);
613                 String JavaDoc domain = fixupDomainlabel(uriAuthority.substring(atIndex+1));
614                 return userinfo + domain;
615             } else {
616                 // uncommon: userinfo, port
617
String JavaDoc userinfo = uriAuthority.substring(0,atIndex+1);
618                 String JavaDoc domain = fixupDomainlabel(uriAuthority.substring(atIndex+1,portColonIndex));
619                 String JavaDoc port = uriAuthority.substring(portColonIndex);
620                 return userinfo + domain + port;
621             }
622         }
623         return uriAuthority;
624     }
625     
626     /**
627      * Fixup the domain label part of the authority.
628      *
629      * We're more lax than the spec. in that we allow underscores.
630      *
631      * @param label Domain label to fix.
632      * @return Return fixed domain label.
633      * @throws URIException
634      */

635     private String JavaDoc fixupDomainlabel(String JavaDoc label)
636     throws URIException {
637         
638         // apply IDN-punycoding, as necessary
639
try {
640             // TODO: optimize: only apply when necessary, or
641
// keep cache of recent encodings
642
label = IDNA.toASCII(label);
643         } catch (IDNAException e) {
644             if(TextUtils.matches(ACCEPTABLE_ASCII_DOMAIN,label)) {
645                 // domain name has ACE prefix, leading/trailing dash, or
646
// underscore -- but is still a name we wish to tolerate;
647
// simply continue
648
} else {
649                 // problematic domain: neither ASCII acceptable characters
650
// nor IDN-punycodable, so throw exception
651
// TODO: change to HeritrixURIException so distinguishable
652
// from URIExceptions in library code
653
URIException ue = new URIException(e+" "+label);
654                 ue.initCause(e);
655                 throw ue;
656             }
657         }
658         label = label.toLowerCase();
659         return label;
660     }
661     
662     /**
663      * Ensure that there all characters needing escaping
664      * in the passed-in String are escaped. Stray '%' characters
665      * are *not* escaped, as per browser behavior.
666      *
667      * @param u String to escape
668      * @param charset
669      * @return string with any necessary escaping applied
670      */

671     private String JavaDoc ensureMinimalEscaping(String JavaDoc u, final String JavaDoc charset) {
672         return ensureMinimalEscaping(u, charset, LaxURLCodec.EXPANDED_URI_SAFE);
673     }
674     
675     /**
676      * Ensure that there all characters needing escaping
677      * in the passed-in String are escaped. Stray '%' characters
678      * are *not* escaped, as per browser behavior.
679      *
680      * @param u String to escape
681      * @param charset
682      * @param bitset
683      * @return string with any necessary escaping applied
684      */

685     private String JavaDoc ensureMinimalEscaping(String JavaDoc u, final String JavaDoc charset,
686             final BitSet JavaDoc bitset) {
687         if (u == null) {
688             return null;
689         }
690         for (int i = 0; i < u.length(); i++) {
691             char c = u.charAt(i);
692             if (!bitset.get(c)) {
693                 try {
694                     u = LaxURLCodec.DEFAULT.encode(bitset, u, charset);
695                 } catch (UnsupportedEncodingException JavaDoc e) {
696                     e.printStackTrace();
697                 }
698                 break;
699             }
700         }
701         return u;
702     }
703
704     /**
705      * Escape any whitespace found.
706      *
707      * The parent class takes care of the bulk of escaping. But if any
708      * instance of escaping is found in the URI, then we ask for parent
709      * to do NO escaping. Here we escape any whitespace found irrespective
710      * of whether the uri has already been escaped. We do this for
711      * case where uri has been judged already-escaped only, its been
712      * incompletly done and whitespace remains. Spaces, etc., in the URI are
713      * a real pain. Their presence will break log file and ARC parsing.
714      * @param uri URI string to check.
715      * @return uri with spaces escaped if any found.
716      */

717     protected String JavaDoc escapeWhitespace(String JavaDoc uri) {
718         // Just write a new string anyways. The perl '\s' is not
719
// as inclusive as the Character.isWhitespace so there are
720
// whitespace characters we could miss. So, rather than
721
// write some awkward regex, just go through the string
722
// a character at a time. Only create buffer first time
723
// we find a space.
724
MutableString buffer = null;
725         for (int i = 0; i < uri.length(); i++) {
726             char c = uri.charAt(i);
727             if (Character.isWhitespace(c)) {
728                 if (buffer == null) {
729                     buffer = new MutableString(uri.length() +
730                         2 /*If space, two extra characters (at least)*/);
731                     buffer.append(uri.substring(0, i));
732                 }
733                 buffer.append("%");
734                 String JavaDoc hexStr = Integer.toHexString(c);
735                 if ((hexStr.length() % 2) > 0) {
736                     buffer.append("0");
737                 }
738                 buffer.append(hexStr);
739                 
740             } else {
741                 if (buffer != null) {
742                     buffer.append(c);
743                 }
744             }
745         }
746         return (buffer != null)? buffer.toString(): uri;
747     }
748
749     /**
750      * Check port on passed http authority. Make sure the size is not larger
751      * than allowed: See the 'port' definition on this
752      * page, http://www.kerio.com/manual/wrp/en/418.htm.
753      * Also, we've seen port numbers of '0080' whose leading zeros confuse
754      * the parent class. Strip the leading zeros.
755      *
756      * @param uriAuthority
757      * @return Null or an amended port number.
758      * @throws URIException
759      */

760     private String JavaDoc checkPort(String JavaDoc uriAuthority)
761     throws URIException {
762         Matcher JavaDoc m = PORTREGEX.matcher(uriAuthority);
763         if (m.matches()) {
764             String JavaDoc no = m.group(2);
765             if (no != null && no.length() > 0) {
766                 // First check if the port has leading zeros
767
// as in '0080'. Strip them if it has and
768
// then reconstitute the uriAuthority. Be careful
769
// of cases where port is '0' or '000'.
770
while (no.charAt(0) == '0' && no.length() > 1) {
771                     no = no.substring(1);
772                 }
773                 uriAuthority = m.group(1) + no;
774                 // Now makesure the number is legit.
775
int portNo = Integer.parseInt(no);
776                 if (portNo <= 0 || portNo > 65535) {
777                     throw new URIException("Port out of bounds: " +
778                         uriAuthority);
779                 }
780             }
781         }
782         return uriAuthority;
783     }
784
785     /**
786      * @param b Buffer to append to.
787      * @param str String to append if not null.
788      * @param substr Suffix or prefix to use if <code>str</code> is not null.
789      * @param suffix True if <code>substr</code> is a suffix.
790      */

791     private void appendNonNull(MutableString b, String JavaDoc str, String JavaDoc substr,
792             boolean suffix) {
793         if (str != null && str.length() > 0) {
794             if (!suffix) {
795                 b.append(substr);
796             }
797             b.append(str);
798             if (suffix) {
799                 b.append(substr);
800             }
801         }
802     }
803
804     /**
805      * @param str String to work on.
806      * @param prefix Prefix to strip if present.
807      * @return <code>str</code> w/o <code>prefix</code>.
808      */

809     private String JavaDoc stripPrefix(String JavaDoc str, String JavaDoc prefix) {
810         return str.startsWith(prefix)?
811             str.substring(prefix.length(), str.length()):
812             str;
813     }
814
815     /**
816      * @param str String to work on.
817      * @param tail Tail to strip if present.
818      * @return <code>str</code> w/o <code>tail</code>.
819      */

820     private static String JavaDoc stripTail(String JavaDoc str, String JavaDoc tail) {
821         return str.endsWith(tail)?
822             str.substring(0, str.length() - tail.length()):
823             str;
824     }
825
826     /**
827      * @param element to examine.
828      * @return Null if passed null or an empty string otherwise
829      * <code>element</code>.
830      */

831     private String JavaDoc checkUriElement(String JavaDoc element) {
832         return (element == null || element.length() <= 0)? null: element;
833     }
834
835     /**
836      * @param element to examine and lowercase if non-null.
837      * @return Null if passed null or an empty string otherwise
838      * <code>element</code> lowercased.
839      */

840     private String JavaDoc checkUriElementAndLowerCase(String JavaDoc element) {
841         String JavaDoc tmp = checkUriElement(element);
842         return (tmp != null)? tmp.toLowerCase(): tmp;
843     }
844 }
845
Popular Tags