1 package net.javacoding.jspider.core.util.html; 2 3 import net.javacoding.jspider.core.util.URLUtil; 4 5 import java.net.MalformedURLException ; 6 import java.net.URL ; 7 import java.util.StringTokenizer ; 8 9 12 public class URLFinder { 13 14 public static final String basePattern = "<base HREF="; 15 16 public static final String [] patterns = { 17 "href=", 18 "src=", 19 "background=" 20 }; 21 22 public static void findURLs(URLFinderCallback callback, String line) { 23 findBase(callback, line, basePattern); 24 for (int i = 0; i < patterns.length; i++) { 25 String pattern = patterns[i]; 26 findURLs(callback, line, pattern); 27 } 28 } 29 30 protected static void findBase(URLFinderCallback callback, String line, String pattern) { 31 String lineLowerCase = line.toLowerCase(); 32 int pos = lineLowerCase.indexOf(pattern); 33 if ( pos != -1 ) { 34 String url = ""; 35 try { 36 url = extractURL(line, pos + pattern.length()); 37 URL baseURL = URLUtil.normalize(new URL (url)); 38 callback.setContextURL(baseURL); 39 } catch (MalformedURLException e) { 40 callback.malformedContextURLFound(url); 41 } 42 } 43 } 44 45 protected static void findURLs(URLFinderCallback callback, String line, String pattern) { 46 String lineLowerCase = line.toLowerCase(); 47 int pos = lineLowerCase.indexOf(pattern); 48 while (pos != -1) { 49 String uri = ""; 50 try { 51 uri = extractURL(line, pos + pattern.length()); 52 URL baseURL = callback.getContextURL(); 53 if ( ! URLUtil.isFileSpecified(baseURL)) { 54 baseURL = new URL (baseURL.toString() + "/"); 56 } 57 URL foundURL = URLUtil.normalize(new URL (baseURL, uri)); 58 callback.urlFound(foundURL); 59 } catch (MalformedURLException e) { 60 callback.malformedUrlFound(uri); 61 } 62 pos = lineLowerCase.indexOf(pattern, pos + pattern.length()); 63 } 64 } 65 66 protected static String extractURL(String string, int pos) { 67 char c = string.charAt(pos); 68 String ret = ""; 69 if (c == '\'' || c == '"') { 70 string = string.substring(pos + 1); 71 } else { 72 string = string.substring(pos); 73 } 74 if (string.length() > 0) { 75 c = string.charAt(0); 76 if (c == '\'' || c == '\"' || c == '>') { 77 ret = ""; 78 } else { 79 StringTokenizer st = new StringTokenizer (string, " \"\'>"); 80 ret = st.nextToken(); 81 } 82 } 83 int p = ret.indexOf('#'); 84 if (p > -1) { 85 return ret.substring(0, p); 86 } else { 87 return ret; 88 } 89 } 90 91 } 92 | Popular Tags |