1 25 package org.archive.net; 26 27 import gnu.inet.encoding.IDNA; 28 import gnu.inet.encoding.IDNAException; 29 import it.unimi.dsi.mg4j.util.MutableString; 30 31 import java.io.UnsupportedEncodingException ; 32 import java.util.Arrays ; 33 import java.util.BitSet ; 34 import java.util.logging.Level ; 35 import java.util.logging.Logger ; 36 import java.util.regex.Matcher ; 37 import java.util.regex.Pattern ; 38 39 import org.apache.commons.httpclient.URI; 40 import org.apache.commons.httpclient.URIException; 41 import org.archive.util.TextUtils; 42 43 44 58 public class UURIFactory extends URI { 59 60 private static final long serialVersionUID = -6146295130382209042L; 61 62 65 private static Logger logger = 66 Logger.getLogger(UURIFactory.class.getName()); 67 68 71 private static final UURIFactory factory = new UURIFactory(); 72 73 130 final static Pattern RFC2396REGEX = Pattern.compile( 131 "^(([^:/?#]+):)?((//([^/?#]*))?([^?#]*)(\\?([^#]*))?)?(#(.*))?"); 132 145 public static final String SLASHDOTDOTSLASH = "^(/\\.\\./)+"; 146 public static final String SLASH = "/"; 147 public static final String HTTP = "http"; 148 public static final String HTTP_PORT = ":80"; 149 public static final String HTTPS = "https"; 150 public static final String HTTPS_PORT = ":443"; 151 public static final String DOT = "."; 152 public static final String EMPTY_STRING = ""; 153 public static final String NBSP = "\u00A0"; 154 public static final String SPACE = " "; 155 public static final String ESCAPED_SPACE = "%20"; 156 public static final String TRAILING_ESCAPED_SPACE = "^(.*)(%20)+$"; 157 public static final String PIPE = "|"; 158 public static final String PIPE_PATTERN = "\\|"; 159 public static final String ESCAPED_PIPE = "%7C"; 160 public static final String CIRCUMFLEX = "^"; 161 public static final String CIRCUMFLEX_PATTERN = "\\^"; 162 public static final String ESCAPED_CIRCUMFLEX = "%5E"; 163 public static final String QUOT = "\""; 164 public static final String ESCAPED_QUOT = "%22"; 165 public static final String SQUOT = "'"; 166 public static final String ESCAPED_SQUOT = "%27"; 167 public static final String APOSTROPH = "`"; 168 public static final String ESCAPED_APOSTROPH = "%60"; 169 public static final String LSQRBRACKET = "["; 170 public static final String LSQRBRACKET_PATTERN = "\\["; 171 public static final String ESCAPED_LSQRBRACKET = "%5B"; 172 public static final String RSQRBRACKET = "]"; 173 public static final String RSQRBRACKET_PATTERN = "\\]"; 174 public static final String ESCAPED_RSQRBRACKET = "%5D"; 175 public static final String LCURBRACKET = "{"; 176 public static final String LCURBRACKET_PATTERN = "\\{"; 177 public static final String ESCAPED_LCURBRACKET = "%7B"; 178 public static final String RCURBRACKET = "}"; 179 public static final String RCURBRACKET_PATTERN = "\\}"; 180 public static final String ESCAPED_RCURBRACKET = "%7D"; 181 public static final String BACKSLASH = "\\"; 182 public static final String BACKSLASH_PATTERN = "\\\\"; 183 public static final String ESCAPED_BACKSLASH = "%5C"; 184 public static final String NEWLINE = "\n+|\r+"; 185 public static final String IMPROPERESC_REPLACE = "%25$1"; 186 public static final String IMPROPERESC = 187 "%((?:[^\\p{XDigit}])|(?:.[^\\p{XDigit}])|(?:\\z))"; 188 public static final String COMMERCIAL_AT = "@"; 189 public static final char PERCENT_SIGN = '%'; 190 public static final char COLON = ':'; 191 192 195 public static final String URI_HEX_ENCODING = 196 "^[^%]*%[\\p{XDigit}][\\p{XDigit}].*"; 197 198 201 final static Pattern PORTREGEX = Pattern.compile("(.*:)([0-9]+)$"); 202 203 212 final static String ACCEPTABLE_ASCII_DOMAIN = 213 "^(?:[a-zA-Z0-9_-]++(?:\\.)?)++$"; 214 215 219 final static Pattern HTTP_SCHEME_SLASHES = 220 Pattern.compile("^(https?://)/+(.*)"); 221 222 225 final static Pattern MULTIPLE_SLASHES = Pattern.compile("//+"); 226 227 230 private static final String SCHEMES_KEY = ".schemes"; 231 232 235 private static final String IGNORED_SCHEMES_KEY = ".ignored-schemes"; 236 237 private String [] schemes = null; 238 private String [] ignoredSchemes = null; 239 240 public static final int IGNORED_SCHEME = 9999999; 241 242 245 private UURIFactory() { 246 super(); 247 String s = System.getProperty(this.getClass().getName() + SCHEMES_KEY); 248 if (s != null && s.length() > 0) { 249 schemes = s.split("[, ]+"); 250 Arrays.sort(schemes); 251 } 252 String ignored = System.getProperty(this.getClass().getName() + IGNORED_SCHEMES_KEY); 253 if (ignored != null && ignored.length() > 0) { 254 ignoredSchemes = ignored.split("[, ]+"); 255 Arrays.sort(ignoredSchemes); 256 } 257 } 258 259 264 public static UURI getInstance(String uri) throws URIException { 265 return UURIFactory.factory.create(uri); 266 } 267 268 274 public static UURI getInstance(String uri, String charset) 275 throws URIException { 276 return UURIFactory.factory.create(uri, charset); 277 } 278 279 285 public static UURI getInstance(UURI base, String relative) 286 throws URIException { 287 return UURIFactory.factory.create(base, relative); 288 } 289 290 297 public static boolean hasSupportedScheme(String possibleUrl) { 298 boolean hasScheme = UURI.hasScheme(possibleUrl); 299 if (!hasScheme || UURIFactory.factory.schemes == null) { 300 return hasScheme; 301 } 302 String tmpStr = possibleUrl.substring(0, possibleUrl.indexOf(':')); 303 return Arrays.binarySearch(UURIFactory.factory.schemes, tmpStr) >= 0; 304 } 305 306 311 private UURI create(String uri) throws URIException { 312 return create(uri, UURI.getDefaultProtocolCharset()); 313 } 314 315 321 private UURI create(String uri, String charset) throws URIException { 322 UURI uuri = new UURI(fixup(uri, null, charset), true, charset); 323 if (logger.isLoggable(Level.FINE)) { 324 logger.fine("URI " + uri + 325 " PRODUCT " + uuri.toString() + 326 " CHARSET " + charset); 327 } 328 return validityCheck(uuri); 329 } 330 331 337 private UURI create(UURI base, String relative) throws URIException { 338 UURI uuri = new UURI(base, new UURI(fixup(relative, base, base.getProtocolCharset()), 339 true, base.getProtocolCharset())); 340 if (logger.isLoggable(Level.FINE)) { 341 logger.fine(" URI " + relative + 342 " PRODUCT " + uuri.toString() + 343 " CHARSET " + base.getProtocolCharset() + 344 " BASE " + base); 345 } 346 return validityCheck(uuri); 347 } 348 349 360 protected UURI validityCheck(UURI uuri) throws URIException { 361 if (uuri.getRawURI().length > UURI.MAX_URL_LENGTH) { 362 throw new URIException("Created (escaped) uuri > " + 363 UURI.MAX_URL_LENGTH +": "+uuri.toString()); 364 } 365 return uuri; 366 } 367 368 383 private String fixup(String uri, final URI base, final String charset) 384 throws URIException { 385 if (uri == null) { 386 throw new NullPointerException (); 387 } else if (uri.length() == 0 && base == null) { 388 throw new URIException("URI length is zero (and not relative)."); 389 } 390 391 if (uri.length() > UURI.MAX_URL_LENGTH) { 392 throw new URIException("URI length > " + UURI.MAX_URL_LENGTH + 394 ": " + uri); 395 } 396 397 if (uri.indexOf(NBSP) >= 0) { 400 uri = TextUtils.replaceAll(NBSP, uri, SPACE); 401 } 402 403 uri = uri.trim(); 405 406 if (uri.indexOf(BACKSLASH) >= 0) { 412 uri = TextUtils.replaceAll(BACKSLASH_PATTERN, uri, SLASH); 413 } 414 415 uri = TextUtils.replaceAll(NEWLINE, uri, EMPTY_STRING); 417 418 Matcher matcher = HTTP_SCHEME_SLASHES.matcher(uri); 422 if (matcher.matches()) { 423 uri = matcher.group(1) + matcher.group(2); 424 } 425 426 uri = escapeWhitespace(uri); 428 429 matcher = RFC2396REGEX.matcher(uri); 432 if (!matcher.matches()) { 433 throw new URIException("Failed parse of " + uri); 434 } 435 String uriScheme = checkUriElementAndLowerCase(matcher.group(2)); 436 String uriSchemeSpecificPart = checkUriElement(matcher.group(3)); 437 String uriAuthority = checkUriElement(matcher.group(5)); 438 String uriPath = checkUriElement(matcher.group(6)); 439 String uriQuery = checkUriElement(matcher.group(8)); 440 442 if (uriScheme != null && uriScheme.length() > 0 && 444 this.schemes != null) { 445 if (!(Arrays.binarySearch(schemes,uriScheme)>=0)) { 446 if((Arrays.binarySearch(ignoredSchemes,uriScheme)>=0)) { 448 throw new URIException( 449 IGNORED_SCHEME, "Ignored scheme: " + uriScheme); 450 } else { 451 throw new URIException("Unsupported scheme: " + uriScheme); 452 } 453 } 454 } 455 456 if (uriScheme == null || uriScheme.length() <= 0) { 458 if (base == null) { 459 throw new URIException("Relative URI but no base: " + uri); 460 } 461 } else { 462 checkHttpSchemeSpecificPartSlashPrefix(base, uriScheme, 463 uriSchemeSpecificPart); 464 } 465 466 uriAuthority = fixupAuthority(uriAuthority); 469 470 if (uriSchemeSpecificPart != null && 472 uriSchemeSpecificPart.startsWith(SLASH)) { 473 if (uriPath != null) { 474 uriPath = TextUtils.replaceFirst(SLASHDOTDOTSLASH, uriPath, 476 SLASH); 477 } 478 if (uriPath == null || EMPTY_STRING.equals(uriPath)) { 482 uriPath = SLASH; 483 } 484 } 485 486 if (uriAuthority != null) { 487 if (uriScheme != null && uriScheme.length() > 0 && 488 uriScheme.equals(HTTP)) { 489 uriAuthority = checkPort(uriAuthority); 490 uriAuthority = stripTail(uriAuthority, HTTP_PORT); 491 } else if (uriScheme != null && uriScheme.length() > 0 && 492 uriScheme.equals(HTTPS)) { 493 uriAuthority = checkPort(uriAuthority); 494 uriAuthority = stripTail(uriAuthority, HTTPS_PORT); 495 } 496 uriAuthority = stripTail(uriAuthority, DOT); 498 uriAuthority = stripPrefix(uriAuthority, DOT); 499 } else { 500 if (uriScheme != null && base != null 504 && uriScheme.equals(base.getScheme())) { 505 uriScheme = null; 507 } 508 } 509 510 527 uriPath = ensureMinimalEscaping(uriPath, charset); 528 uriQuery = ensureMinimalEscaping(uriQuery, charset, 529 LaxURLCodec.QUERY_SAFE); 530 531 MutableString s = new MutableString( 534 ((uriScheme != null)? uriScheme.length(): 0) 535 + 1 + ((uriAuthority != null)? uriAuthority.length(): 0) 537 + 2 + ((uriPath != null)? uriPath.length(): 0) 539 + 1 + ((uriQuery != null)? uriQuery.length(): 0)); 541 appendNonNull(s, uriScheme, ":", true); 542 appendNonNull(s, uriAuthority, "//", false); 543 appendNonNull(s, uriPath, "", false); 544 appendNonNull(s, uriQuery, "?", false); 545 return s.toString(); 546 } 547 548 554 protected void checkHttpSchemeSpecificPartSlashPrefix(final URI base, 555 final String scheme, final String schemeSpecificPart) 556 throws URIException { 557 if (base != null) { 559 return; 560 } 561 if (scheme == null || scheme.length() <= 0) { 562 return; 563 } 564 if (!scheme.equals("http") && !scheme.equals("https")) { 565 return; 566 } 567 if (!schemeSpecificPart.startsWith("//")) { 568 throw new URIException("http scheme specific part must " + 569 "begin '//': " + schemeSpecificPart); 570 } 571 if (schemeSpecificPart.length() <= 2) { 572 throw new URIException("http scheme specific part is " + 573 "too short: " + schemeSpecificPart); 574 } 575 } 576 577 586 private String fixupAuthority(String uriAuthority) throws URIException { 587 if (uriAuthority != null) { 591 while(uriAuthority.endsWith(ESCAPED_SPACE)) { 596 uriAuthority = uriAuthority.substring(0,uriAuthority.length()-3); 597 } 598 599 int atIndex = uriAuthority.indexOf(COMMERCIAL_AT); 601 int portColonIndex = uriAuthority.indexOf(COLON,(atIndex<0)?0:atIndex); 602 if(atIndex<0 && portColonIndex<0) { 603 return fixupDomainlabel(uriAuthority); 605 } else if (atIndex<0 && portColonIndex>-1) { 606 String domain = fixupDomainlabel(uriAuthority.substring(0,portColonIndex)); 608 String port = uriAuthority.substring(portColonIndex); 609 return domain + port; 610 } else if (atIndex>-1 && portColonIndex<0) { 611 String userinfo = uriAuthority.substring(0,atIndex+1); 613 String domain = fixupDomainlabel(uriAuthority.substring(atIndex+1)); 614 return userinfo + domain; 615 } else { 616 String userinfo = uriAuthority.substring(0,atIndex+1); 618 String domain = fixupDomainlabel(uriAuthority.substring(atIndex+1,portColonIndex)); 619 String port = uriAuthority.substring(portColonIndex); 620 return userinfo + domain + port; 621 } 622 } 623 return uriAuthority; 624 } 625 626 635 private String fixupDomainlabel(String label) 636 throws URIException { 637 638 try { 640 label = IDNA.toASCII(label); 643 } catch (IDNAException e) { 644 if(TextUtils.matches(ACCEPTABLE_ASCII_DOMAIN,label)) { 645 } else { 649 URIException ue = new URIException(e+" "+label); 654 ue.initCause(e); 655 throw ue; 656 } 657 } 658 label = label.toLowerCase(); 659 return label; 660 } 661 662 671 private String ensureMinimalEscaping(String u, final String charset) { 672 return ensureMinimalEscaping(u, charset, LaxURLCodec.EXPANDED_URI_SAFE); 673 } 674 675 685 private String ensureMinimalEscaping(String u, final String charset, 686 final BitSet bitset) { 687 if (u == null) { 688 return null; 689 } 690 for (int i = 0; i < u.length(); i++) { 691 char c = u.charAt(i); 692 if (!bitset.get(c)) { 693 try { 694 u = LaxURLCodec.DEFAULT.encode(bitset, u, charset); 695 } catch (UnsupportedEncodingException e) { 696 e.printStackTrace(); 697 } 698 break; 699 } 700 } 701 return u; 702 } 703 704 717 protected String escapeWhitespace(String uri) { 718 MutableString buffer = null; 725 for (int i = 0; i < uri.length(); i++) { 726 char c = uri.charAt(i); 727 if (Character.isWhitespace(c)) { 728 if (buffer == null) { 729 buffer = new MutableString(uri.length() + 730 2 ); 731 buffer.append(uri.substring(0, i)); 732 } 733 buffer.append("%"); 734 String hexStr = Integer.toHexString(c); 735 if ((hexStr.length() % 2) > 0) { 736 buffer.append("0"); 737 } 738 buffer.append(hexStr); 739 740 } else { 741 if (buffer != null) { 742 buffer.append(c); 743 } 744 } 745 } 746 return (buffer != null)? buffer.toString(): uri; 747 } 748 749 760 private String checkPort(String uriAuthority) 761 throws URIException { 762 Matcher m = PORTREGEX.matcher(uriAuthority); 763 if (m.matches()) { 764 String no = m.group(2); 765 if (no != null && no.length() > 0) { 766 while (no.charAt(0) == '0' && no.length() > 1) { 771 no = no.substring(1); 772 } 773 uriAuthority = m.group(1) + no; 774 int portNo = Integer.parseInt(no); 776 if (portNo <= 0 || portNo > 65535) { 777 throw new URIException("Port out of bounds: " + 778 uriAuthority); 779 } 780 } 781 } 782 return uriAuthority; 783 } 784 785 791 private void appendNonNull(MutableString b, String str, String substr, 792 boolean suffix) { 793 if (str != null && str.length() > 0) { 794 if (!suffix) { 795 b.append(substr); 796 } 797 b.append(str); 798 if (suffix) { 799 b.append(substr); 800 } 801 } 802 } 803 804 809 private String stripPrefix(String str, String prefix) { 810 return str.startsWith(prefix)? 811 str.substring(prefix.length(), str.length()): 812 str; 813 } 814 815 820 private static String stripTail(String str, String tail) { 821 return str.endsWith(tail)? 822 str.substring(0, str.length() - tail.length()): 823 str; 824 } 825 826 831 private String checkUriElement(String element) { 832 return (element == null || element.length() <= 0)? null: element; 833 } 834 835 840 private String checkUriElementAndLowerCase(String element) { 841 String tmp = checkUriElement(element); 842 return (tmp != null)? tmp.toLowerCase(): tmp; 843 } 844 } 845 | Popular Tags |