1 2 3 4 package net.nutch.net; 5 6 import java.net.URL ; 7 import java.net.MalformedURLException ; 8 11 import java.util.logging.Logger ; 12 import net.nutch.util.LogFormatter; 13 import org.apache.oro.text.regex.*; 14 15 16 public class BasicUrlNormalizer implements UrlNormalizer { 17 public static final Logger LOG = 18 LogFormatter.getLogger("net.nutch.net.BasicUrlNormalizer"); 19 20 private Perl5Compiler compiler = new Perl5Compiler(); 21 private ThreadLocal matchers = new ThreadLocal () { 22 protected synchronized Object initialValue() { 23 return new Perl5Matcher(); 24 } 25 }; 26 private Rule relativePathRule = null; 27 private Rule leadingRelativePathRule = null; 28 29 public BasicUrlNormalizer() { 30 try { 31 relativePathRule = new Rule(); 35 relativePathRule.pattern = (Perl5Pattern) 36 compiler.compile("(/[^/]*[^/.]{1}[^/]*/\\.\\./)", 37 Perl5Compiler.READ_ONLY_MASK); 38 relativePathRule.substitution = new Perl5Substitution("/"); 39 40 leadingRelativePathRule = new Rule(); 43 leadingRelativePathRule.pattern = (Perl5Pattern) 44 compiler.compile("^(/\\.\\./)+", Perl5Compiler.READ_ONLY_MASK); 45 leadingRelativePathRule.substitution = new Perl5Substitution("/"); 46 47 } catch (MalformedPatternException e) { 48 e.printStackTrace(); 49 throw new RuntimeException (e); 50 } 51 } 52 53 public String normalize(String urlString) 54 throws MalformedURLException { 55 if ("".equals(urlString)) return urlString; 57 58 urlString = urlString.trim(); 60 URL url = new URL (urlString); 61 62 String protocol = url.getProtocol(); 63 String host = url.getHost(); 64 int port = url.getPort(); 65 String file = url.getFile(); 66 67 boolean changed = false; 68 69 if (!urlString.startsWith(protocol)) changed = true; 71 72 if ("http".equals(protocol) || "ftp".equals(protocol)) { 73 74 if (host != null) { 75 String newHost = host.toLowerCase(); if (!host.equals(newHost)) { 77 host = newHost; 78 changed = true; 79 } 80 } 81 82 if (port == url.getDefaultPort()) { port = -1; changed = true; 85 } 86 87 if (file == null || "".equals(file)) { file = "/"; 89 changed = true; 90 } 91 92 if (url.getRef() != null) { changed = true; 94 } 95 96 String file2 = substituteUnnecessaryRelativePaths(file); 98 99 if (!file.equals(file2)) { 100 changed = true; 101 file = file2; 102 } 103 104 } 105 106 if (changed) 107 urlString = new URL (protocol, host, port, file).toString(); 108 109 return urlString; 110 } 111 112 private String substituteUnnecessaryRelativePaths(String file) { 113 String fileWorkCopy = file; 114 int oldLen = file.length(); 115 int newLen = oldLen - 1; 116 117 Perl5Matcher matcher = (Perl5Matcher)matchers.get(); 134 135 while (oldLen != newLen) { 136 oldLen = fileWorkCopy.length(); 138 fileWorkCopy = Util.substitute 139 (matcher, relativePathRule.pattern, 140 relativePathRule.substitution, fileWorkCopy, 1); 141 142 fileWorkCopy = Util.substitute 144 (matcher, leadingRelativePathRule.pattern, 145 leadingRelativePathRule.substitution, fileWorkCopy, 1); 146 newLen = fileWorkCopy.length(); 147 } 148 149 return fileWorkCopy; 150 } 151 152 153 157 private static class Rule { 158 public Perl5Pattern pattern; 159 public Perl5Substitution substitution; 160 } 161 162 } 163 164 | Popular Tags |