UURIFactory


1   /* UURIFactory
2    *
3    * $Id: UURIFactory.java,v 1.12.4.1 2007/01/13 01:31:38 stack-sf Exp $
4    *
5    * Created on July 16, 2004
6    *
7    * Copyright (C) 2003 Internet Archive.
8    *
9    * This file is part of the Heritrix web crawler (crawler.archive.org).
10   *
11   * Heritrix is free software; you can redistribute it and/or modify
12   * it under the terms of the GNU Lesser Public License as published by
13   * the Free Software Foundation; either version 2.1 of the License, or
14   * any later version.
15   *
16   * Heritrix is distributed in the hope that it will be useful,
17   * but WITHOUT ANY WARRANTY; without even the implied warranty of
18   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19   * GNU Lesser Public License for more details.
20   *
21   * You should have received a copy of the GNU Lesser Public License
22   * along with Heritrix; if not, write to the Free Software
23   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
24   */
25  package org.archive.net;
26  
27  import gnu.inet.encoding.IDNA;
28  import gnu.inet.encoding.IDNAException;
29  import it.unimi.dsi.mg4j.util.MutableString;
30  
31  import java.io.UnsupportedEncodingException  ;
32  import java.util.Arrays  ;
33  import java.util.BitSet  ;
34  import java.util.logging.Level  ;
35  import java.util.logging.Logger  ;
36  import java.util.regex.Matcher  ;
37  import java.util.regex.Pattern  ;
38  
39  import org.apache.commons.httpclient.URI;
40  import org.apache.commons.httpclient.URIException;
41  import org.archive.util.TextUtils;
42  
43  
44  /**
45   * Factory that returns UURIs.
46   * 
47   * Does escaping and fixup on URIs massaging in accordance with RFC2396
48   * and to match browser practice. For example, it removes any
49   * '..' if first thing in the path as per IE,  converts backslashes to forward
50   * slashes, and discards any 'fragment'/anchor portion of the URI. This
51   * class will also fail URIs if they are longer than IE's allowed maximum
52   * length.
53   * 
54   * <p>TODO: Test logging.
55   * 
56   * @author stack
57   */
58  public class UURIFactory extends URI {
59      
60      private static final long serialVersionUID = -6146295130382209042L;
61  
62      /**
63       * Logging instance.
64       */
65      private static Logger   logger =
66          Logger.getLogger(UURIFactory.class.getName());
67      
68      /**
69       * The single instance of this factory.
70       */
71      private static final UURIFactory factory = new UURIFactory();
72      
73      /**
74       * RFC 2396-inspired regex.
75       *
76       * From the RFC Appendix B:
77       * <pre>
78       * URI Generic Syntax                August 1998
79       *
80       * B. Parsing a URI Reference with a Regular Expression
81       *
82       * As described in Section 4.3, the generic URI syntax is not sufficient
83       * to disambiguate the components of some forms of URI.  Since the
84       * "greedy algorithm" described in that section is identical to the
85       * disambiguation method used by POSIX regular expressions, it is
86       * natural and commonplace to use a regular expression for parsing the
87       * potential four components and fragment identifier of a URI reference.
88       *
89       * The following line is the regular expression for breaking-down a URI
90       * reference into its components.
91       *
92       * ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
93       * 12            3  4          5       6  7        8 9
94       *
95       * The numbers in the second line above are only to assist readability;
96       * they indicate the reference points for each subexpression (i.e., each
97       * paired parenthesis).  We refer to the value matched for subexpression
98       * <n> as $<n>.  For example, matching the above expression to
99       *
100      * http://www.ics.uci.edu/pub/ietf/uri/#Related
101      *
102      * results in the following subexpression matches:
103      *
104      * $1 = http:
105      * $2 = http
106      * $3 = //www.ics.uci.edu
107      * $4 = www.ics.uci.edu
108      * $5 = /pub/ietf/uri/
109      * $6 = <undefined>
110      * $7 = <undefined>
111      * $8 = #Related
112      * $9 = Related
113      *
114      * where <undefined> indicates that the component is not present, as is
115      * the case for the query component in the above example.  Therefore, we
116      * can determine the value of the four components and fragment as
117      *
118      * scheme    = $2
119      * authority = $4
120      * path      = $5
121      * query     = $7
122      * fragment  = $9
123      * </pre>
124      *
125      * -- 
126      * <p>Below differs from the rfc regex in that it has java escaping of
127      * regex characters and we allow a URI made of a fragment only (Added extra
128      * group so indexing is off by one after scheme).
129      */
130     final static Pattern   RFC2396REGEX = Pattern.compile(
131         "^(([^:/?#]+):)?((//([^/?#]*))?([^?#]*)(\\?([^#]*))?)?(#(.*))?");
132     //    12            34  5          6       7   8          9 A
133     //              2 1             54        6          87 3      A9
134     // 1: scheme:
135     // 2: scheme
136     // 3: //authority/path?query
137     // 4: //authority
138     // 5: authority
139     // 6: path
140     // 7: ?query
141     // 8: query 
142     // 9: #fragment
143     // A: fragment
144 
145     public static final String   SLASHDOTDOTSLASH = "^(/\\.\\./)+";
146     public static final String   SLASH = "/";
147     public static final String   HTTP = "http";
148     public static final String   HTTP_PORT = ":80";
149     public static final String   HTTPS = "https";
150     public static final String   HTTPS_PORT = ":443";
151     public static final String   DOT = ".";
152     public static final String   EMPTY_STRING = "";
153     public static final String   NBSP = "\u00A0";
154     public static final String   SPACE = " ";
155     public static final String   ESCAPED_SPACE = "%20";
156     public static final String   TRAILING_ESCAPED_SPACE = "^(.*)(%20)+$";
157     public static final String   PIPE = "|";
158     public static final String   PIPE_PATTERN = "\\|";
159     public static final String   ESCAPED_PIPE = "%7C";
160     public static final String   CIRCUMFLEX = "^";
161     public static final String   CIRCUMFLEX_PATTERN = "\\^";
162     public static final String   ESCAPED_CIRCUMFLEX = "%5E";
163     public static final String   QUOT = "\"";
164     public static final String   ESCAPED_QUOT = "%22";
165     public static final String   SQUOT = "'";
166     public static final String   ESCAPED_SQUOT = "%27";
167     public static final String   APOSTROPH = "`";
168     public static final String   ESCAPED_APOSTROPH = "%60";
169     public static final String   LSQRBRACKET = "[";
170     public static final String   LSQRBRACKET_PATTERN = "\\[";
171     public static final String   ESCAPED_LSQRBRACKET = "%5B";
172     public static final String   RSQRBRACKET = "]";
173     public static final String   RSQRBRACKET_PATTERN = "\\]";
174     public static final String   ESCAPED_RSQRBRACKET = "%5D";
175     public static final String   LCURBRACKET = "{";
176     public static final String   LCURBRACKET_PATTERN = "\\{";
177     public static final String   ESCAPED_LCURBRACKET = "%7B";
178     public static final String   RCURBRACKET = "}";
179     public static final String   RCURBRACKET_PATTERN = "\\}";
180     public static final String   ESCAPED_RCURBRACKET = "%7D";
181     public static final String   BACKSLASH = "\\";
182     public static final String   BACKSLASH_PATTERN = "\\\\";
183     public static final String   ESCAPED_BACKSLASH = "%5C";
184     public static final String   NEWLINE = "\n+|\r+";
185     public static final String   IMPROPERESC_REPLACE = "%25$1";
186     public static final String   IMPROPERESC =
187         "%((?:[^\\p{XDigit}])|(?:.[^\\p{XDigit}])|(?:\\z))";
188     public static final String   COMMERCIAL_AT = "@";
189     public static final char PERCENT_SIGN = '%';
190     public static final char COLON = ':';
191     
192     /**
193      * First percent sign in string followed by two hex chars.
194      */
195     public static final String   URI_HEX_ENCODING =
196         "^[^%]*%[\\p{XDigit}][\\p{XDigit}].*";
197     
198     /**
199      * Authority port number regex.
200      */
201     final static Pattern   PORTREGEX = Pattern.compile("(.*:)([0-9]+)$");
202     
203     /**
204      * Characters we'll accept in the domain label part of a URI
205      * authority: ASCII letters-digits-hyphen (LDH) plus underscore,
206      * with single intervening '.' characters.
207      * 
208      * (We accept '_' because DNS servers have tolerated for many
209      * years counter to spec; we also accept dash patterns and ACE
210      * prefixes that will be rejected by IDN-punycoding attempt.)
211      */
212     final static String   ACCEPTABLE_ASCII_DOMAIN =
213         "^(?:[a-zA-Z0-9_-]++(?:\\.)?)++$";
214     
215     /**
216      * Pattern that looks for case of three or more slashes after the 
217      * scheme.  If found, we replace them with two only as mozilla does.
218      */
219     final static Pattern   HTTP_SCHEME_SLASHES =
220         Pattern.compile("^(https?://)/+(.*)");
221     
222     /**
223      * Pattern that looks for case of two or more slashes in a path.
224      */
225     final static Pattern   MULTIPLE_SLASHES = Pattern.compile("//+");
226     
227     /**
228      * System property key for list of supported schemes.
229      */
230     private static final String   SCHEMES_KEY = ".schemes";
231     
232     /**
233      * System property key for list of purposefully-ignored schemes.
234      */
235     private static final String   IGNORED_SCHEMES_KEY = ".ignored-schemes";
236 
237     private String  [] schemes = null;
238     private String  [] ignoredSchemes = null;
239 
240     public static final int IGNORED_SCHEME = 9999999;
241     
242     /**
243      * Private constructor; reads the optional scheme lists from system properties.
244      */
245     private UURIFactory() {
246         super();
247         String   s = System.getProperty(this.getClass().getName() + SCHEMES_KEY);
248         if (s != null && s.length() > 0) {
249             schemes = s.split("[, ]+");
250             Arrays.sort(schemes);
251         }
252         String   ignored = System.getProperty(this.getClass().getName() + IGNORED_SCHEMES_KEY);
253         if (ignored != null && ignored.length() > 0) {
254             ignoredSchemes  = ignored.split("[, ]+");
255             Arrays.sort(ignoredSchemes);
256         }
257     }
258     
259     /**
260      * @param uri URI as string.
261      * @return An instance of UURI
262      * @throws URIException
263      */
264     public static UURI getInstance(String   uri) throws URIException {
265         return UURIFactory.factory.create(uri);
266     }
267     
268     /**
269      * @param uri URI as string.
270      * @param charset Character encoding of the passed uri string.
271      * @return An instance of UURI
272      * @throws URIException
273      */
274     public static UURI getInstance(String   uri, String   charset)
275             throws URIException {
276         return UURIFactory.factory.create(uri, charset);
277     }
278     
279     /**
280      * @param base Base uri to use resolving passed relative uri.
281      * @param relative URI as string.
282      * @return An instance of UURI
283      * @throws URIException
284      */
285     public static UURI getInstance(UURI base, String   relative)
286             throws URIException {
287         return UURIFactory.factory.create(base, relative);
288     }
289     
290     /**
291      * Test of whether passed String has an allowed URI scheme.
292      * First tests if likely scheme suffix.  If so, we then test if its one of
293      * the supported schemes.
294      * @param possibleUrl URL string to examine.
295      * @return True if passed string looks like it could be an URL.
296      */
297     public static boolean hasSupportedScheme(String   possibleUrl) {
298         boolean hasScheme = UURI.hasScheme(possibleUrl);
299         if (!hasScheme || UURIFactory.factory.schemes == null) {
300             return hasScheme;
301         }
302         String   tmpStr = possibleUrl.substring(0, possibleUrl.indexOf(':'));
303         return Arrays.binarySearch(UURIFactory.factory.schemes, tmpStr) >= 0;
304     }
305 
306     /**
307      * @param uri URI as string.
308      * @return Instance of UURI.
309      * @throws URIException
310      */
311     private UURI create(String   uri) throws URIException {
312         return create(uri, UURI.getDefaultProtocolCharset());
313     }
314     
315     /**
316      * @param uri URI as string.
317      * @param charset Original encoding of the string.
318      * @return Instance of UURI.
319      * @throws URIException
320      */
321     private UURI create(String   uri, String   charset) throws URIException {
322         UURI uuri  = new UURI(fixup(uri, null, charset), true, charset);
323         if (logger.isLoggable(Level.FINE)) {
324             logger.fine("URI " + uri +
325                 " PRODUCT " + uuri.toString() +
326                 " CHARSET " + charset);
327         }
328         return validityCheck(uuri);
329     }
330     
331     /**
332      * @param base UURI to use as a base resolving <code>relative</code>.
333      * @param relative Relative URI.
334      * @return Instance of UURI.
335      * @throws URIException
336      */
337     private UURI create(UURI base, String   relative) throws URIException {
338         UURI uuri = new UURI(base, new UURI(fixup(relative, base, base.getProtocolCharset()),
339             true, base.getProtocolCharset()));
340         if (logger.isLoggable(Level.FINE)) {
341             logger.fine(" URI " + relative +
342                 " PRODUCT " + uuri.toString() +
343                 " CHARSET " + base.getProtocolCharset() +
344                 " BASE " + base);
345         }
346         return validityCheck(uuri);
347     }
348 
349     /**
350      * Check the generated UURI.
351      * 
352      * At the least look at length of uuri string.  We were seeing case
353      * where before escaping, string was &lt; MAX_URL_LENGTH but after was
354      * &gt;.  Letting out a too-big message was causing us troubles later
355      * down the processing chain.
356      * @param uuri Created uuri to check.
357      * @return The passed <code>uuri</code> so can easily inline this check.
358      * @throws URIException
359      */
360     protected UURI validityCheck(UURI uuri) throws URIException {
361         if (uuri.getRawURI().length > UURI.MAX_URL_LENGTH) {
362            throw new URIException("Created (escaped) uuri > " +
363               UURI.MAX_URL_LENGTH +": "+uuri.toString());
364         }
365         return uuri;
366     }
367     
368     /**
369      * Do heritrix fix-up on passed uri string.
370      *
371      * Does heritrix escaping; usually escaping done to make our behavior align
372      * with IEs.  This method codifies our experience pulling URIs from the
373      * wilds.  It does all the escaping we want; its output can always be
374      * assumed to be 'escaped' (though perhaps to a laxer standard than the 
375      * vanilla HttpClient URI class or official specs might suggest). 
376      *
377      * @param uri URI as string.
378      * @param base May be null.
379      * @param charset Character encoding used when escaping the path and query.
380      * @return A fixed up URI string.
381      * @throws URIException
382      */
383     private String   fixup(String   uri, final URI base, final String   charset)
384     throws URIException {
385         if (uri == null) {
386             throw new NullPointerException  ();
387         } else if (uri.length() == 0 && base == null) {
388             throw new URIException("URI length is zero (and not relative).");
389         }
390         
391         if (uri.length() > UURI.MAX_URL_LENGTH) {
392             // We check length here and again later after all convertions.
393             throw new URIException("URI length > " + UURI.MAX_URL_LENGTH +
394                 ": " + uri);
395         }
396         
397         // Replace nbsp with normal spaces (so that they get stripped if at
398         // ends, or encoded if in middle)
399         if (uri.indexOf(NBSP) >= 0) {
400             uri = TextUtils.replaceAll(NBSP, uri, SPACE);
401         }
402         
403         // Get rid of any trailing spaces or new-lines. 
404         uri = uri.trim();
405         
406         // IE actually converts backslashes to slashes rather than to %5C.
407         // Since URIs that have backslashes usually work only with IE, we will
408         // convert backslashes to slashes as well.
409         // TODO: Maybe we can first convert backslashes by specs and than by IE
410         // so that we fetch both versions.
411         if (uri.indexOf(BACKSLASH) >= 0) {
412             uri = TextUtils.replaceAll(BACKSLASH_PATTERN, uri, SLASH);
413         }
414         
415         // Kill newlines etc
416         uri = TextUtils.replaceAll(NEWLINE, uri, EMPTY_STRING);
417         
418         // Test for the case of more than two slashes after the http(s) scheme.
419         // Replace with two slashes as mozilla does if found.
420         // See [ 788219 ] URI Syntax Errors stop page parsing.
421         Matcher   matcher = HTTP_SCHEME_SLASHES.matcher(uri);
422         if (matcher.matches()) {
423             uri = matcher.group(1) + matcher.group(2);
424         }
425 
426         // now, minimally escape any whitespace
427         uri = escapeWhitespace(uri);
428         
429         // For further processing, get uri elements.  See the RFC2396REGEX
430         // comment above for explaination of group indices used in the below.
431         matcher = RFC2396REGEX.matcher(uri);
432         if (!matcher.matches()) {
433             throw new URIException("Failed parse of " + uri);
434         }
435         String   uriScheme = checkUriElementAndLowerCase(matcher.group(2));
436         String   uriSchemeSpecificPart = checkUriElement(matcher.group(3));
437         String   uriAuthority = checkUriElement(matcher.group(5));
438         String   uriPath = checkUriElement(matcher.group(6));
439         String   uriQuery = checkUriElement(matcher.group(8));
440         // UNUSED String uriFragment = checkUriElement(matcher.group(10));
441         
442         // If a scheme, is it a supported scheme?
443         if (uriScheme != null && uriScheme.length() > 0 &&
444                 this.schemes != null) {
445             if (!(Arrays.binarySearch(schemes,uriScheme)>=0)) {
446                 // unsupported; see if silently ignored
447                 if((Arrays.binarySearch(ignoredSchemes,uriScheme)>=0)) {
448                     throw new URIException(
449                             IGNORED_SCHEME, "Ignored scheme: " + uriScheme);
450                 } else {
451                     throw new URIException("Unsupported scheme: " + uriScheme);
452                 }
453             }
454         }
455         
456         // Test if relative URI. If so, need a base to resolve against.
457         if (uriScheme == null || uriScheme.length() <= 0) {
458             if (base == null) {
459                 throw new URIException("Relative URI but no base: " + uri);
460             }
461         } else {
462             checkHttpSchemeSpecificPartSlashPrefix(base, uriScheme,
463                 uriSchemeSpecificPart);
464         }
465         
466         // fixup authority portion: lowercase/IDN-punycode any domain; 
467         // remove stray trailing spaces
468         uriAuthority = fixupAuthority(uriAuthority);
469 
470         // Do some checks if absolute path.
471         if (uriSchemeSpecificPart != null &&
472                 uriSchemeSpecificPart.startsWith(SLASH)) {
473             if (uriPath != null) {
474                 // Eliminate '..' if its first thing in the path.  IE does this.
475                 uriPath = TextUtils.replaceFirst(SLASHDOTDOTSLASH, uriPath,
476                     SLASH);
477             }
478             // Ensure root URLs end with '/': browsers always send "/"
479             // on the request-line, so we should consider "http://host"
480             // to be "http://host/".
481             if (uriPath == null || EMPTY_STRING.equals(uriPath)) {
482                 uriPath = SLASH;
483             }
484         }
485 
486         if (uriAuthority != null) {
487             if (uriScheme != null && uriScheme.length() > 0 &&
488                     uriScheme.equals(HTTP)) {
489                 uriAuthority = checkPort(uriAuthority);
490                 uriAuthority = stripTail(uriAuthority, HTTP_PORT);
491             } else if (uriScheme != null && uriScheme.length() > 0 &&
492                     uriScheme.equals(HTTPS)) {
493                 uriAuthority = checkPort(uriAuthority);
494                 uriAuthority = stripTail(uriAuthority, HTTPS_PORT);
495             }
496             // Strip any prefix dot or tail dots from the authority.
497             uriAuthority = stripTail(uriAuthority, DOT);
498             uriAuthority = stripPrefix(uriAuthority, DOT);
499         } else {
500             // no authority; may be relative. consider stripping scheme
501             // to work-around org.apache.commons.httpclient.URI bug
502             // ( http://issues.apache.org/jira/browse/HTTPCLIENT-587 )
503             if (uriScheme != null && base != null
504                     && uriScheme.equals(base.getScheme())) {
505                 // uriScheme redundant and will only confound httpclient.URI
506                 uriScheme = null; 
507             }
508         }
509         
510         // Ensure minimal escaping. Use of 'lax' URI and URLCodec 
511         // means minimal escaping isn't necessarily complete/consistent.
512         // There is a chance such lax encoding will throw exceptions
513         // later at inconvenient times. 
514         //
515         // One reason for these bad escapings -- though not the only --
516         // is that the page is using an encoding other than the ASCII or the
517         // UTF-8 that is our default URI encoding.  In this case the parent
518         // class is burping on the passed URL encoding.  If the page encoding
519         // was passed into this factory, the encoding seems to be parsed
520         // correctly (See the testEscapedEncoding unit test).
521         //
522         // This fixup may cause us to miss content.  There is the charset case
523         // noted above.  TODO: Look out for cases where we fail other than for
524         // the above given reason which will be fixed when we address
525         // '[ 913687 ] Make extractors interrogate for charset'.
526 
527         uriPath = ensureMinimalEscaping(uriPath, charset);
528         uriQuery = ensureMinimalEscaping(uriQuery, charset,
529             LaxURLCodec.QUERY_SAFE);
530 
531         // Preallocate.  The '1's and '2's in below are space for ':',
532         // '//', etc. URI characters.
533         MutableString s = new MutableString(
534             ((uriScheme != null)? uriScheme.length(): 0)
535             + 1 // ';' 
536             + ((uriAuthority != null)? uriAuthority.length(): 0)
537             + 2 // '//'
538             + ((uriPath != null)? uriPath.length(): 0)
539             + 1 // '?'
540             + ((uriQuery != null)? uriQuery.length(): 0));
541         appendNonNull(s, uriScheme, ":", true);
542         appendNonNull(s, uriAuthority, "//", false);
543         appendNonNull(s, uriPath, "", false);
544         appendNonNull(s, uriQuery, "?", false);
545         return s.toString();
546     }
547     
548     /**
549      * If http(s) scheme, check scheme specific part begins '//'.
550      * @throws URIException 
551      * @see http://www.faqs.org/rfcs/rfc1738.html Section 3.1. Common Internet
552      * Scheme Syntax
553      */
554     protected void checkHttpSchemeSpecificPartSlashPrefix(final URI base,
555             final String   scheme, final String   schemeSpecificPart)
556     throws URIException {
557         // Only apply this check if no base.
558         if (base != null) {
559             return;
560         }
561         if (scheme == null || scheme.length() <= 0) {
562             return;
563         }
564         if (!scheme.equals("http") && !scheme.equals("https")) {
565             return;
566         }
567         if (!schemeSpecificPart.startsWith("//")) {
568             throw new URIException("http scheme specific part must " +
569                 "begin '//': " + schemeSpecificPart);
570         }
571         if (schemeSpecificPart.length() <= 2) {
572             throw new URIException("http scheme specific part is " +
573                 "too short: " + schemeSpecificPart);
574         }
575     }
576     
577     /**
578      * Fixup 'authority' portion of URI, by removing any stray 
579      * encoded spaces, lowercasing any domain names, and applying
580      * IDN-punycoding to Unicode domains. 
581      * 
582      * @param uriAuthority the authority string to fix
583      * @return fixed version
584      * @throws URIException
585      */
586     private String   fixupAuthority(String   uriAuthority) throws URIException {
587         // Lowercase the host part of the uriAuthority; don't destroy any
588         // userinfo capitalizations.  Make sure no illegal characters in
589         // domainlabel substring of the uri authority.
590         if (uriAuthority != null) {
591             // Get rid of any trailing escaped spaces:
592             // http://www.archive.org%20.  Rare but happens.
593             // TODO: reevaluate: do IE or firefox do such mid-URI space-removal?
594             // if not, we shouldn't either. 
595             while(uriAuthority.endsWith(ESCAPED_SPACE)) {
596                 uriAuthority = uriAuthority.substring(0,uriAuthority.length()-3);
597             }
598 
599             // lowercase & IDN-punycode only the domain portion
600             int atIndex = uriAuthority.indexOf(COMMERCIAL_AT);
601             int portColonIndex = uriAuthority.indexOf(COLON,(atIndex<0)?0:atIndex);
602             if(atIndex<0 && portColonIndex<0) {
603                 // most common case: neither userinfo nor port
604                 return fixupDomainlabel(uriAuthority);
605             } else if (atIndex<0 && portColonIndex>-1) {
606                 // next most common: port but no userinfo
607                 String   domain = fixupDomainlabel(uriAuthority.substring(0,portColonIndex));
608                 String   port = uriAuthority.substring(portColonIndex);
609                 return domain + port;
610             } else if (atIndex>-1 && portColonIndex<0) {
611                 // uncommon: userinfo, no port
612                 String   userinfo = uriAuthority.substring(0,atIndex+1);
613                 String   domain = fixupDomainlabel(uriAuthority.substring(atIndex+1));
614                 return userinfo + domain;
615             } else {
616                 // uncommon: userinfo, port
617                 String   userinfo = uriAuthority.substring(0,atIndex+1);
618                 String   domain = fixupDomainlabel(uriAuthority.substring(atIndex+1,portColonIndex));
619                 String   port = uriAuthority.substring(portColonIndex);
620                 return userinfo + domain + port;
621             }
622         }
623         return uriAuthority;
624     }
625     
626     /**
627      * Fixup the domain label part of the authority.
628      * 
629      * We're more lax than the spec. in that we allow underscores.
630      * 
631      * @param label Domain label to fix.
632      * @return Return fixed domain label.
633      * @throws URIException
634      */
635     private String   fixupDomainlabel(String   label)
636     throws URIException {
637         
638         // apply IDN-punycoding, as necessary
639         try {
640             // TODO: optimize: only apply when necessary, or
641             // keep cache of recent encodings
642             label = IDNA.toASCII(label);
643         } catch (IDNAException e) {
644             if(TextUtils.matches(ACCEPTABLE_ASCII_DOMAIN,label)) {
645                 // domain name has ACE prefix, leading/trailing dash, or 
646                 // underscore -- but is still a name we wish to tolerate;
647                 // simply continue
648             } else {
649                 // problematic domain: neither ASCII acceptable characters
650                 // nor IDN-punycodable, so throw exception 
651                 // TODO: change to HeritrixURIException so distinguishable
652                 // from URIExceptions in library code
653                 URIException ue = new URIException(e+" "+label);
654                 ue.initCause(e);
655                 throw ue;
656             }
657         }
658         label = label.toLowerCase();
659         return label;
660     }
661     
662     /**
663      * Ensure that all characters needing escaping
664      * in the passed-in String are escaped. Stray '%' characters
665      * are *not* escaped, as per browser behavior. 
666      * 
667      * @param u String to escape
668      * @param charset 
669      * @return string with any necessary escaping applied
670      */
671     private String   ensureMinimalEscaping(String   u, final String   charset) {
672         return ensureMinimalEscaping(u, charset, LaxURLCodec.EXPANDED_URI_SAFE);
673     }
674     
675     /**
676      * Ensure that all characters needing escaping
677      * in the passed-in String are escaped. Stray '%' characters
678      * are *not* escaped, as per browser behavior. 
679      * 
680      * @param u String to escape
681      * @param charset 
682      * @param bitset 
683      * @return string with any necessary escaping applied
684      */
685     private String   ensureMinimalEscaping(String   u, final String   charset,
686             final BitSet   bitset) {
687         if (u == null) {
688             return null;
689         }
690         for (int i = 0; i < u.length(); i++) {
691             char c = u.charAt(i);
692             if (!bitset.get(c)) {
693                 try {
694                     u = LaxURLCodec.DEFAULT.encode(bitset, u, charset);
695                 } catch (UnsupportedEncodingException   e) {
696                     e.printStackTrace();
697                 }
698                 break;
699             }
700         }
701         return u;
702     }
703 
704     /**
705      * Escape any whitespace found.
706      * 
707      * The parent class takes care of the bulk of escaping.  But if any
708      * instance of escaping is found in the URI, then we ask for parent
709      * to do NO escaping.  Here we escape any whitespace found irrespective
710      * of whether the uri has already been escaped.  We do this for
711      * case where uri has been judged already-escaped only, it's been
712      * incompletely done and whitespace remains.  Spaces, etc., in the URI are
713      * a real pain.  Their presence will break log file and ARC parsing.
714      * @param uri URI string to check.
715      * @return uri with spaces escaped if any found.
716      */
717     protected String   escapeWhitespace(String   uri) {
718         // Just write a new string anyways.  The perl '\s' is not
719         // as inclusive as the Character.isWhitespace so there are
720         // whitespace characters we could miss.  So, rather than
721         // write some awkward regex, just go through the string
722         // a character at a time.  Only create buffer first time
723         // we find a space.
724         MutableString buffer = null;
725         for (int i = 0; i < uri.length(); i++) {
726             char c = uri.charAt(i);
727             if (Character.isWhitespace(c)) {
728                 if (buffer == null) {
729                     buffer = new MutableString(uri.length() +
730                         2 /*If space, two extra characters (at least)*/);
731                     buffer.append(uri.substring(0, i));
732                 }
733                 buffer.append("%");
734                 String   hexStr = Integer.toHexString(c);
735                 if ((hexStr.length() % 2) > 0) {
736                     buffer.append("0");
737                 }
738                 buffer.append(hexStr);
739                 
740             } else {
741                 if (buffer != null) {
742                     buffer.append(c);
743                 }
744             }
745         }
746         return (buffer !=  null)? buffer.toString(): uri;
747     }
748 
749     /**
750      * Check port on passed http authority.  Make sure the size is not larger
751      * than allowed: See the 'port' definition on this
752      * page, http://www.kerio.com/manual/wrp/en/418.htm.
753      * Also, we've seen port numbers of '0080' whose leading zeros confuse
754      * the parent class. Strip the leading zeros.
755      *
756      * @param uriAuthority
757      * @return Null or an amended port number.
758      * @throws URIException
759      */
760     private String   checkPort(String   uriAuthority)
761     throws URIException {
762         Matcher   m = PORTREGEX.matcher(uriAuthority);
763         if (m.matches()) {
764             String   no = m.group(2);
765             if (no != null && no.length() > 0) {
766                 // First check if the port has leading zeros
767                 // as in '0080'.  Strip them if it has and
768                 // then reconstitute the uriAuthority.  Be careful
769                 // of cases where port is '0' or '000'.
770                 while (no.charAt(0) == '0' && no.length() > 1) {
771                     no = no.substring(1);
772                 }
773                 uriAuthority = m.group(1) + no;
774                 // Now makesure the number is legit.
775                 int portNo = Integer.parseInt(no);
776                 if (portNo <= 0 || portNo > 65535) {
777                     throw new URIException("Port out of bounds: " +
778                         uriAuthority);
779                 }
780             }
781         }
782         return uriAuthority;
783     }
784 
785     /**
786      * @param b Buffer to append to.
787      * @param str String to append if not null.
788      * @param substr Suffix or prefix to use if <code>str</code> is not null.
789      * @param suffix True if <code>substr</code> is a suffix.
790      */
791     private void appendNonNull(MutableString b, String   str, String   substr,
792             boolean suffix) {
793         if (str != null && str.length() > 0) {
794             if (!suffix) {
795                 b.append(substr);
796             }
797             b.append(str);
798             if (suffix) {
799                 b.append(substr);
800             }
801         }
802     }
803 
804     /**
805      * @param str String to work on.
806      * @param prefix Prefix to strip if present.
807      * @return <code>str</code> w/o <code>prefix</code>.
808      */
809     private String   stripPrefix(String   str, String   prefix) {
810         return str.startsWith(prefix)?
811             str.substring(prefix.length(), str.length()):
812             str;
813     }
814 
815     /**
816      * @param str String to work on.
817      * @param tail Tail to strip if present.
818      * @return <code>str</code> w/o <code>tail</code>.
819      */
820     private static String   stripTail(String   str, String   tail) {
821         return str.endsWith(tail)?
822             str.substring(0, str.length() - tail.length()):
823             str;
824     }
825 
826     /**
827      * @param element to examine.
828      * @return Null if passed null or an empty string otherwise
829      * <code>element</code>.
830      */
831     private String   checkUriElement(String   element) {
832         return (element == null || element.length() <= 0)? null: element;
833     }
834 
835     /**
836      * @param element to examine and lowercase if non-null.
837      * @return Null if passed null or an empty string otherwise
838      * <code>element</code> lowercased.
839      */
840     private String   checkUriElementAndLowerCase(String   element) {
841         String   tmp = checkUriElement(element);
842         return (tmp != null)? tmp.toLowerCase(): tmp;
843     }
844 }
845
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags