KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > net > javacoding > jspider > core > util > html > URLFinder


package net.javacoding.jspider.core.util.html;

import net.javacoding.jspider.core.util.URLUtil;

import java.net.MalformedURLException;
import java.net.URL;
import java.util.StringTokenizer;

9 /**
10  * $Id: URLFinder.java,v 1.9 2003/04/10 16:19:17 vanrogu Exp $
11  */

12 public class URLFinder {
13
14     public static final String JavaDoc basePattern = "<base HREF=";
15
16     public static final String JavaDoc[] patterns = {
17       "href=",
18       "src=",
19       "background="
20     };
21
22     public static void findURLs(URLFinderCallback callback, String JavaDoc line) {
23         findBase(callback, line, basePattern);
24         for (int i = 0; i < patterns.length; i++) {
25             String JavaDoc pattern = patterns[i];
26             findURLs(callback, line, pattern);
27         }
28     }
29
30     protected static void findBase(URLFinderCallback callback, String JavaDoc line, String JavaDoc pattern) {
31         String JavaDoc lineLowerCase = line.toLowerCase();
32         int pos = lineLowerCase.indexOf(pattern);
33         if ( pos != -1 ) {
34             String JavaDoc url = "";
35             try {
36                 url = extractURL(line, pos + pattern.length());
37                 URL JavaDoc baseURL = URLUtil.normalize(new URL JavaDoc(url));
38                 callback.setContextURL(baseURL);
39             } catch (MalformedURLException JavaDoc e) {
40                 callback.malformedContextURLFound(url);
41             }
42         }
43     }
44
45     protected static void findURLs(URLFinderCallback callback, String JavaDoc line, String JavaDoc pattern) {
46         String JavaDoc lineLowerCase = line.toLowerCase();
47         int pos = lineLowerCase.indexOf(pattern);
48         while (pos != -1) {
49             String JavaDoc uri = "";
50             try {
51                 uri = extractURL(line, pos + pattern.length());
52                 URL JavaDoc baseURL = callback.getContextURL();
53                 if ( ! URLUtil.isFileSpecified(baseURL)) {
54                 // Force a slash in case of a folder (to avoid buggy relative refs)
55
baseURL = new URL JavaDoc(baseURL.toString() + "/");
56                 }
57                 URL JavaDoc foundURL = URLUtil.normalize(new URL JavaDoc(baseURL, uri));
58                 callback.urlFound(foundURL);
59             } catch (MalformedURLException JavaDoc e) {
60                 callback.malformedUrlFound(uri);
61             }
62             pos = lineLowerCase.indexOf(pattern, pos + pattern.length());
63         }
64     }
65
66     protected static String JavaDoc extractURL(String JavaDoc string, int pos) {
67         char c = string.charAt(pos);
68         String JavaDoc ret = "";
69         if (c == '\'' || c == '"') {
70             string = string.substring(pos + 1);
71         } else {
72             string = string.substring(pos);
73         }
74         if (string.length() > 0) {
75             c = string.charAt(0);
76             if (c == '\'' || c == '\"' || c == '>') {
77                 ret = "";
78             } else {
79                 StringTokenizer JavaDoc st = new StringTokenizer JavaDoc(string, " \"\'>");
80                 ret = st.nextToken();
81             }
82         }
83         int p = ret.indexOf('#');
84         if (p > -1) {
85             return ret.substring(0, p);
86         } else {
87             return ret;
88         }
89     }
90
91 }
92
Popular Tags