KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > net > nutch > net > BasicUrlNormalizer


1 /* Copyright (c) 2003 The Nutch Organization. All rights reserved. */
2 /* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
3
4 package net.nutch.net;
5
6 import java.net.URL JavaDoc;
7 import java.net.MalformedURLException JavaDoc;
8 // import java.net.URI;
9
// import java.net.URISyntaxException;
10

11 import java.util.logging.Logger JavaDoc;
12 import net.nutch.util.LogFormatter;
13 import org.apache.oro.text.regex.*;
14
15 /** Converts URLs to a normal form . */
16 public class BasicUrlNormalizer implements UrlNormalizer {
17     public static final Logger JavaDoc LOG =
18             LogFormatter.getLogger("net.nutch.net.BasicUrlNormalizer");
19
20     private Perl5Compiler compiler = new Perl5Compiler();
21     private ThreadLocal JavaDoc matchers = new ThreadLocal JavaDoc() {
22         protected synchronized Object JavaDoc initialValue() {
23           return new Perl5Matcher();
24         }
25       };
26     private Rule relativePathRule = null;
27     private Rule leadingRelativePathRule = null;
28
29     public BasicUrlNormalizer() {
30       try {
31         // this pattern tries to find spots like "/xx/../" in the url, which
32
// could be replaced by "/" xx consists of chars, different then "/"
33
// (slash) and needs to have at least one char different from "."
34
relativePathRule = new Rule();
35         relativePathRule.pattern = (Perl5Pattern)
36           compiler.compile("(/[^/]*[^/.]{1}[^/]*/\\.\\./)",
37                            Perl5Compiler.READ_ONLY_MASK);
38         relativePathRule.substitution = new Perl5Substitution("/");
39
40         // this pattern tries to find spots like leading "/../" in the url,
41
// which could be replaced by "/"
42
leadingRelativePathRule = new Rule();
43         leadingRelativePathRule.pattern = (Perl5Pattern)
44           compiler.compile("^(/\\.\\./)+", Perl5Compiler.READ_ONLY_MASK);
45         leadingRelativePathRule.substitution = new Perl5Substitution("/");
46
47       } catch (MalformedPatternException e) {
48         e.printStackTrace();
49         throw new RuntimeException JavaDoc(e);
50       }
51     }
52
53     public String JavaDoc normalize(String JavaDoc urlString)
54             throws MalformedURLException JavaDoc {
55         if ("".equals(urlString)) // permit empty
56
return urlString;
57
58         urlString = urlString.trim(); // remove extra spaces
59

60         URL JavaDoc url = new URL JavaDoc(urlString);
61
62         String JavaDoc protocol = url.getProtocol();
63         String JavaDoc host = url.getHost();
64         int port = url.getPort();
65         String JavaDoc file = url.getFile();
66
67         boolean changed = false;
68
69         if (!urlString.startsWith(protocol)) // protocol was lowercased
70
changed = true;
71
72         if ("http".equals(protocol) || "ftp".equals(protocol)) {
73
74             if (host != null) {
75                 String JavaDoc newHost = host.toLowerCase(); // lowercase host
76
if (!host.equals(newHost)) {
77                     host = newHost;
78                     changed = true;
79                 }
80             }
81
82             if (port == url.getDefaultPort()) { // uses default port
83
port = -1; // so don't specify it
84
changed = true;
85             }
86
87             if (file == null || "".equals(file)) { // add a slash
88
file = "/";
89                 changed = true;
90             }
91
92             if (url.getRef() != null) { // remove the ref
93
changed = true;
94             }
95
96             // check for unnecessary use of "/../"
97
String JavaDoc file2 = substituteUnnecessaryRelativePaths(file);
98
99             if (!file.equals(file2)) {
100                 changed = true;
101                 file = file2;
102             }
103
104         }
105
106         if (changed)
107             urlString = new URL JavaDoc(protocol, host, port, file).toString();
108
109         return urlString;
110     }
111
112     private String JavaDoc substituteUnnecessaryRelativePaths(String JavaDoc file) {
113         String JavaDoc fileWorkCopy = file;
114         int oldLen = file.length();
115         int newLen = oldLen - 1;
116
117         // All substitutions will be done step by step, to ensure that certain
118
// constellations will be normalized, too
119
//
120
// For example: "/aa/bb/../../cc/../foo.html will be normalized in the
121
// following manner:
122
// "/aa/bb/../../cc/../foo.html"
123
// "/aa/../cc/../foo.html"
124
// "/cc/../foo.html"
125
// "/foo.html"
126
//
127
// The normalization also takes care of leading "/../", which will be
128
// replaced by "/", because this is a rather a sign of bad webserver
129
// configuration than of a wanted link. For example, urls like
130
// "http://www.foo.com/../" should return a http 404 error instead of
131
// redirecting to "http://www.foo.com".
132
//
133
Perl5Matcher matcher = (Perl5Matcher)matchers.get();
134
135         while (oldLen != newLen) {
136             // substitue first occurence of "/xx/../" by "/"
137
oldLen = fileWorkCopy.length();
138             fileWorkCopy = Util.substitute
139               (matcher, relativePathRule.pattern,
140                relativePathRule.substitution, fileWorkCopy, 1);
141
142             // remove leading "/../"
143
fileWorkCopy = Util.substitute
144               (matcher, leadingRelativePathRule.pattern,
145                leadingRelativePathRule.substitution, fileWorkCopy, 1);
146             newLen = fileWorkCopy.length();
147         }
148
149         return fileWorkCopy;
150     }
151
152
153     /**
154      * Class which holds a compiled pattern and its corresponding substition
155      * string.
156      */

157     private static class Rule {
158         public Perl5Pattern pattern;
159         public Perl5Substitution substitution;
160     }
161
162 }
163
164
Popular Tags