KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > net > javacoding > jspider > core > util > URLUtil


1 package net.javacoding.jspider.core.util;
2
3 import java.net.MalformedURLException JavaDoc;
4 import java.net.URL JavaDoc;
5 import java.util.ArrayList JavaDoc;
6 import java.util.StringTokenizer JavaDoc;
7
/**
 * Some URL related methods gathered as static methods in this utility class.
 *
 * $Id: URLUtil.java,v 1.13 2003/04/29 17:53:49 vanrogu Exp $
 *
 * @author Günther Van Roey
 */
public class URLUtil {

    /**
     * Normalizes the given url: backslashes become forward slashes, repeated
     * slashes after the scheme are collapsed, '/./' segments are removed and
     * an eventual query string is stripped.  A trailing slash is kept as is.
     * @param original the original URL to be normalized
     * @return the normalized url, or null if original is null or the
     * normalized string cannot be parsed as a URL
     */
    public static URL normalize(URL original) {
        if (original == null) {
            return null;
        }

        String urlString = original.toString();

        // Backslashes go first, so that separators written as '\' are seen
        // by the slash and dot-folder steps that follow.
        urlString = normalizeBackSlashes(urlString);
        urlString = normalizeDoubleSlashes(urlString);
        urlString = normalizeDotFolders(urlString);
        urlString = normalizeStripQuery(urlString);

        try {
            return new URL(urlString);
        } catch (MalformedURLException e) {
            // Best effort: callers get null when the result is not a valid URL.
            return null;
        }
    }

    /**
     * Replaces all backslashes by front slashes in the given url string
     * @param original the original url string
     * @return the url string with the normalization applied
     */
    protected static String normalizeBackSlashes(String original) {
        return original.replace('\\', '/');
    }

    /**
     * Replaces all double slashes by single slashes in the given url string.
     * The '//' that directly follows the scheme ('http://', 'https://', ...)
     * or that starts a scheme-less string is left untouched; longer runs of
     * slashes elsewhere are reduced to a single slash.
     * @param original the original url string
     * @return the url string with the normalization applied
     */
    protected static String normalizeDoubleSlashes(String original) {
        String result = original;
        int pos = result.indexOf("//", findPathSearchStart(result));
        while (pos >= 0) {
            // Drop one slash, then look again from the same position so that
            // runs of three or more slashes are fully collapsed.
            result = result.substring(0, pos) + result.substring(pos + 1);
            pos = result.indexOf("//", pos);
        }
        return result;
    }

    /**
     * Finds the index just behind the leading '//' authority marker (with or
     * without a scheme in front of it), or 0 if the string has none.
     */
    private static int findPathSearchStart(String url) {
        if (url.startsWith("//")) {
            return 2;
        }
        int colon = url.indexOf(':');
        // Only a colon that precedes every slash can terminate a scheme.
        if (colon >= 0 && url.indexOf('/') == colon + 1 && url.startsWith("//", colon + 1)) {
            return colon + 3;
        }
        return 0;
    }

    /**
     * Removes all dot folders ( abc/./def/index.html, ...) from the given
     * url string, including consecutive ones ( abc/././def ).
     * @param original the original url string
     * @return the url string with the normalization applied
     */
    protected static String normalizeDotFolders(String original) {
        String result = original;
        int pos = result.indexOf("/./");
        while (pos >= 0) {
            // Remove "/." and keep the second slash; it may start another "/./".
            result = result.substring(0, pos) + result.substring(pos + 2);
            pos = result.indexOf("/./", pos);
        }
        return result;
    }

    /**
     * Strips an eventual query string from the resource (index.html?id=1
     * becomes index.html for instance).
     * @param original the original url string
     * @return the url string with the normalization applied
     */
    protected static String normalizeStripQuery(String original) {
        int index = original.indexOf('?');
        return (index >= 0) ? original.substring(0, index) : original;
    }

    /**
     * Removes an eventual trailing slash from the given url string
     * @param original the original url string
     * @return the url string with the normalization applied
     */
    protected static String normalizeStripTrailingSlash(String original) {
        if (original.endsWith("/")) {
            return original.substring(0, original.length() - 1);
        }
        return original;
    }

    /**
     * Converts any resource URL to the site's url.
     * @param resourceURL the url of the resource to find the url of the site for
     * @return the URL pointing to the site in which the resource is located,
     * or null if resourceURL is null
     */
    public static URL getSiteURL(URL resourceURL) {
        if (resourceURL == null) {
            return null;
        }
        try {
            return new URL(resourceURL.getProtocol(), resourceURL.getHost(), resourceURL.getPort(), "");
        } catch (MalformedURLException e) {
            // shouldn't happen, we're only dropping the PATH part of a valid URL ...
            return null;
        }
    }

    /**
     * Returns the URL of the robots.txt resource in the site of the given resource.
     * @param resourceURL the URL of the resource to find the site's robots.txt of
     * @return URL pointing to the robots.txt resource of the site in which
     * resourceURL is, or null if resourceURL is null
     */
    public static URL getRobotsTXTURL(URL resourceURL) {
        if (resourceURL == null) {
            return null;
        }
        try {
            return new URL(getSiteURL(resourceURL), "/robots.txt");
        } catch (MalformedURLException e) {
            // Best effort, consistent with the other methods: report null.
            return null;
        }
    }

    /**
     * Returns the resource path without the resource.
     * @param path the path to the resource
     * @return path up to and including its last slash (empty if the path has
     * no slash), or null if path is null
     */
    public static String stripResource(String path) {
        if (path == null) {
            return null;
        }
        // lastIndexOf yields -1 when there is no slash, giving substring(0, 0).
        return path.substring(0, path.lastIndexOf('/') + 1);
    }

    /**
     * Returns the 'depth' of the resource pointed to by the URL
     * @param url the URL to the resource to calculate the depth of
     * @return the depth of this resource in the site, 0 if url is null
     */
    public static int getDepth(URL url) {
        if (url == null) {
            return 0;
        }

        String path = url.getPath();
        if (!isFileSpecified(url) && !path.endsWith("/")) {
            // A path that names a folder is counted as if it ended in a slash.
            path = path + "/";
        }

        int depth = 0;
        for (int pos = path.indexOf('/'); pos != -1; pos = path.indexOf('/', pos + 1)) {
            // A slash at index 0 only marks the site root and adds no level.
            if (pos > 0) {
                depth++;
            }
        }
        return depth;
    }

    /**
     * Determines whether a file is specified in the path part of the url.
     * This is assumed to be the case if the string after the last slash
     * contains a dot (aaaaa/bbbb/cccc.dddd).
     * @param url the url to test
     * @return boolean value indicating whether a file is specified,
     * false if url is null
     */
    public static boolean isFileSpecified(URL url) {
        if (url == null) {
            return false;
        }
        String path = url.getPath();
        return path.lastIndexOf('.') > path.lastIndexOf('/');
    }

    /**
     * Returns an array of Strings being the folder names of the folders
     * found in the given URL.
     * @param url the url to parse the folders of
     * @return an array of Strings containing all folder names, empty if the
     * url is null or cannot be normalized
     */
    public static String[] getFolderNames(URL url) {
        URL normalized = normalize(url);
        if (normalized == null) {
            return new String[0];
        }

        String path = normalized.getPath();
        if (isFileSpecified(normalized)) {
            path = stripResource(path);
        }

        StringTokenizer st = new StringTokenizer(path, "/");
        String[] folders = new String[st.countTokens()];
        for (int i = 0; i < folders.length; i++) {
            folders[i] = st.nextToken();
        }
        return folders;
    }

    /**
     * Returns the file name (without the path) of the resource specified
     * by the given url, that is everything behind the last slash of its path.
     * @param url the url to get the filename out of
     * @return String containing the name of the file, zero-length if none
     * or if url is null
     */
    public static String getFileName(URL url) {
        if (url == null) {
            return "";
        }
        String path = url.getPath();
        return path.substring(stripResource(path).length());
    }

}
222
Popular Tags