package weblech.spider;

import org.apache.log4j.Category;

import java.util.List;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Set;
import java.net.URL;
import java.net.MalformedURLException;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.FileWriter;
import java.io.PrintWriter;

import weblech.util.Log4j;

/**
 * Extracts link targets from the text of an HTML document using plain
 * string searches (no real HTML parsing).
 *
 * Only double-quoted attribute values are recognised, and only for tags
 * written entirely in lower case or entirely in upper case and followed
 * by a single space (for example <code>&lt;a href="..."&gt;</code> or
 * <code>&lt;IMG SRC="..."&gt;</code>).
 */
public class HTMLParser
{
    private final static Category _logClass = Category.getInstance(HTMLParser.class);

    /** URI scheme prefix that marks an e-mail link rather than a fetchable URL. */
    private final static String MAILTO_SCHEME = "mailto:";

    private SpiderConfig config;

    static
    {
        Log4j.init();
    }

    /**
     * @param config spider configuration; read by this class only to find
     *               the file that mailto links are appended to
     */
    public HTMLParser(SpiderConfig config)
    {
        this.config = config;
    }

    /**
     * Returns the distinct URLs referenced by the given document.
     *
     * @param sourceURL   URL of the document, used as the base for
     *                    resolving relative links
     * @param textContent document text; may be null, in which case an
     *                    empty list is returned
     * @return a new mutable list of java.net.URL objects in order of
     *         discovery, never null
     */
    public List parseLinksInDocument(URL sourceURL, String textContent)
    {
        return parseAsHTML(sourceURL, textContent);
    }

    /**
     * Collects img/src, a/href, body/background and frame/src values, first
     * for the lower-case spelling of every tag and then for the upper-case
     * spelling.
     */
    private List parseAsHTML(URL sourceURL, String textContent)
    {
        _logClass.debug("parseAsHTML()");
        ArrayList newURLs = new ArrayList();
        if(textContent == null)
        {
            // Nothing to scan; return an empty result rather than failing.
            _logClass.debug("Returning 0 urls extracted from page (no content)");
            return newURLs;
        }

        // Holds the external form (String) of every URL already collected.
        // Strings are used instead of URL objects because URL.equals() and
        // URL.hashCode() resolve host names: that blocks on DNS and treats
        // different virtual hosts on the same IP address as the same URL.
        HashSet newURLSet = new HashSet();

        extractAttributesFromTags("img", "src", sourceURL, newURLs, newURLSet, textContent);
        extractAttributesFromTags("a", "href", sourceURL, newURLs, newURLSet, textContent);
        extractAttributesFromTags("body", "background", sourceURL, newURLs, newURLSet, textContent);
        extractAttributesFromTags("frame", "src", sourceURL, newURLs, newURLSet, textContent);
        extractAttributesFromTags("IMG", "SRC", sourceURL, newURLs, newURLSet, textContent);
        extractAttributesFromTags("A", "HREF", sourceURL, newURLs, newURLSet, textContent);
        extractAttributesFromTags("BODY", "BACKGROUND", sourceURL, newURLs, newURLSet, textContent);
        extractAttributesFromTags("FRAME", "SRC", sourceURL, newURLs, newURLSet, textContent);

        // Guarded so the (possibly very large) page text is only concatenated
        // into a message when debug logging is actually enabled.
        if(newURLs.size() == 0 && _logClass.isDebugEnabled())
        {
            _logClass.debug("Got 0 new URLs from HTML parse, check HTML\n" + textContent);
        }
        _logClass.debug("Returning " + newURLs.size() + " urls extracted from page");
        return newURLs;
    }

    /**
     * Scans <code>input</code> for every occurrence of
     * <code>&lt;tag </code> and, when the tag contains
     * <code>attr="..."</code> before its closing <code>&gt;</code>, resolves
     * the attribute value against <code>sourceURL</code>.
     *
     * Fragments (everything from '#') are stripped before resolution.
     * mailto links are written to the mailto log instead of being returned.
     *
     * @param tag       tag name, matched case-sensitively
     * @param attr      attribute name, matched case-sensitively
     * @param sourceURL base URL for resolving relative values
     * @param newURLs   receives each newly discovered URL
     * @param newURLSet external forms of the URLs already in newURLs
     * @param input     document text to scan; must not be null
     */
    private void extractAttributesFromTags(String tag, String attr, URL sourceURL, List newURLs, Set newURLSet, String input)
    {
        _logClass.debug("extractAttributesFromTags(" + tag + ", " + attr + ", ...)");

        String startTag = "<" + tag + " ";
        String attrStr = attr + "=\"";
        int startPos = 0;
        while(true)
        {
            int tagPos = input.indexOf(startTag, startPos);
            if(tagPos < 0)
            {
                return;
            }
            // Whatever happens below, the next search resumes just past this tag.
            startPos = tagPos + 1;

            int attrPos = input.indexOf(attrStr, tagPos + 1);
            if(attrPos < 0)
            {
                // The attribute does not occur anywhere after this tag, so no
                // later tag can carry it either.
                return;
            }

            int nextClosePos = input.indexOf(">", tagPos + 1);
            if(attrPos >= nextClosePos)
            {
                // Attribute belongs to some later tag, or this tag is never
                // closed (nextClosePos == -1).
                continue;
            }

            // Search from the first character of the value so that an empty
            // value (attr="") finds its own closing quote.
            int valueStart = attrPos + attrStr.length();
            int closeQuotePos = input.indexOf("\"", valueStart);
            if(closeQuotePos < 0)
            {
                continue;
            }

            String urlStr = input.substring(valueStart, closeQuotePos);
            int fragmentPos = urlStr.indexOf('#');
            if(fragmentPos != -1)
            {
                urlStr = urlStr.substring(0, fragmentPos);
            }

            if(isMailTo(urlStr))
            {
                logMailURL(urlStr);
                continue;
            }

            try
            {
                URL u = new URL(sourceURL, urlStr);
                // Set.add() returns false for a URL that was already seen.
                if(newURLSet.add(u.toExternalForm()))
                {
                    newURLs.add(u);
                }
            }
            catch(MalformedURLException murle)
            {
                // Best effort: a link that cannot be resolved is skipped.
                _logClass.debug("Ignoring malformed URL '" + urlStr + "': " + murle.getMessage());
            }
        }
    }

    /**
     * Appends the given mailto link as one line to the file named by
     * <code>config.getMailtoLogFile()</code>. I/O failures are logged as
     * warnings and otherwise ignored.
     */
    private void logMailURL(String url)
    {
        _logClass.debug("logMailURL()");

        PrintWriter pW = null;
        try
        {
            pW = new PrintWriter(new FileWriter(config.getMailtoLogFile().toString(), true));
            pW.println(url);
            // PrintWriter never throws on write; checkError() flushes and
            // reports whether any write failed.
            if(pW.checkError())
            {
                _logClass.warn("Error writing mailto URL to log file: " + url);
            }
        }
        catch(IOException ioe)
        {
            _logClass.warn("Caught IO exception writing mailto URL:" + ioe.getMessage(), ioe);
        }
        finally
        {
            if(pW != null)
            {
                pW.close();
            }
        }
    }

    /**
     * Tells whether the link uses the mailto scheme.
     *
     * The check is a case-insensitive prefix match on the trimmed value, so
     * an ordinary URL that merely contains "mailto:" (for example in its
     * query string) is not treated as mail, and the result does not depend
     * on the default locale.
     *
     * @param url link text, may be null
     * @return true if url starts with "mailto:" in any letter case
     */
    private boolean isMailTo(String url)
    {
        if(url == null)
        {
            return false;
        }

        return url.trim().regionMatches(true, 0, MAILTO_SCHEME, 0, MAILTO_SCHEME.length());
    }
}