1 package com.quadcap.http.client; 2 3 40 41 import java.io.*; 42 43 import java.util.ArrayList ; 44 import java.util.Collections ; 45 import java.util.HashMap ; 46 import java.util.Iterator ; 47 48 import org.xml.sax.AttributeList ; 49 import org.xml.sax.DocumentHandler ; 50 import org.xml.sax.DTDHandler ; 51 import org.xml.sax.EntityResolver ; 52 import org.xml.sax.ErrorHandler ; 53 import org.xml.sax.HandlerBase ; 54 import org.xml.sax.InputSource ; 55 import org.xml.sax.Locator ; 56 import org.xml.sax.SAXException ; 57 58 import com.quadcap.text.sax.Parser; 59 60 import com.quadcap.http.util.HeaderParser; 61 62 import com.quadcap.util.collections.ArrayQueue; 63 import com.quadcap.util.collections.DiGraph; 64 65 import com.quadcap.util.text.OctetMap; 66 import com.quadcap.util.text.Scanner; 67 68 import com.quadcap.util.Debug; 69 import com.quadcap.util.Util; 70 71 72 82 public class LinkChecker implements DocumentHandler { 83 84 String base; 85 86 87 String urlBase; 88 89 90 String currentUrl; 91 92 93 DiGraph links = new DiGraph(); 94 95 96 ArrayQueue linksToCheck = new ArrayQueue(); 97 98 99 HashMap allLinks = new HashMap (); 100 101 102 HashMap linksChecked = new HashMap (); 103 104 Parser parser; 105 String host; 106 107 public LinkChecker(String url) { 108 parser = new Parser(); 109 String s = url; 110 if (s.startsWith("http://")) { 111 s = url.substring("http://".length()); 112 } 113 int idx = s.indexOf('/'); 114 if (idx > 0) s = s.substring(0, idx); 115 host = "http://" + s; 116 push(url, 0); 117 } 118 119 synchronized void push(String url, int line) { 120 if (allLinks.get(url) == null && url.startsWith(host)) { 121 System.out.println("PUSH " + trim(base) + " -> " + trim(url)); 122 if (currentUrl != null) { 123 links.addArc(currentUrl + ":" + line, url); 124 } 125 allLinks.put(url, "queued"); 126 linksToCheck.push(url); 127 } 128 } 129 130 String trim(String url) { 131 if (url != null && url.startsWith(host)) { 132 url = url.substring(host.length()); 133 } 134 return url; 135 } 136 137 public void printBadLinks() { 138 ArrayList k = new ArrayList (); 139 Iterator iter = linksChecked.keySet().iterator(); 140 while (iter.hasNext()) { 141 String url = iter.next().toString(); 142 String val = linksChecked.get(url).toString(); 143 if (!val.equals("found")) { 144 Iterator x = links.getParents(url); 145 String ref = x.hasNext() ? x.next().toString() : ""; 146 k.add(trim(ref) + "\n error: " + trim(url)); 147 } 148 } 149 Collections.sort(k); 150 iter = k.iterator(); 151 while (iter.hasNext()) { 152 System.out.println(iter.next().toString()); 153 } 154 System.out.println("--------------------\n"); 155 System.out.println("" + k.size() + " errors"); 156 } 157 158 public void run() throws Exception { 159 int cnt = 0; 161 while (linksToCheck.size() > 0) { 162 System.out.print("" + (linksChecked.size()+1) + " of " + 163 (linksToCheck.size() + linksChecked.size()) + 164 ": "); 165 String url = linksToCheck.popBack().toString(); 166 if (linksChecked.get(url) != null) continue; 167 System.out.println(trim(url)); 168 currentUrl = url; 169 InputStream is = null; 170 try { 171 is = HttpFetcher.fetchStream(url); 172 Scanner scanner = new Scanner(is); 173 HashMap headers = new HashMap (); 174 scanner.skipUntil(OctetMap.wsChars); 175 scanner.skipWhile(OctetMap.wsChars); 176 String resp = scanner.parseUntil(OctetMap.crlfChars); 177 HeaderParser.parseCRLF(scanner); 178 HeaderParser.parseHeaders(scanner, headers); 179 if (!resp.startsWith("200")) { 180 allLinks.put(url, "missing"); 181 linksChecked.put(url, "missing"); 182 Iterator iter = links.getParents(url); 183 String referrer = 184 iter.hasNext() 185 ? iter.next().toString() 186 : "---"; 187 System.err.println("*** " + trim(url) + "," + 188 trim(referrer) + "," + resp); 189 continue; 190 } 191 String mimeType = (String )headers.get("content-type"); 192 if (mimeType == null || !mimeType.equals("text/html")) { 193 continue; 194 } 195 InputStreamReader r = new InputStreamReader(is); 196 InputSource in = new InputSource (r); 197 in.setSystemId(url); 198 parser.setDocumentHandler(this); 199 setBase(url); 200 parser.parse(in); 201 allLinks.put(url, "found"); 202 linksChecked.put(url, "found"); 203 } catch (IOException e) { 204 Debug.print(e); 205 allLinks.put(url, "error"); 206 linksChecked.put(url, "error"); 207 } catch (Exception e3) { 208 Debug.print(e3); 209 allLinks.put(url, "exception"); 210 linksChecked.put(url, "exception"); 211 } catch (Throwable t) { 212 Debug.print(t); 213 allLinks.put(url, "exception"); 214 linksChecked.put(url, "exception"); 215 } finally { 216 if (is != null) is.close(); 217 } 219 } 220 } 221 222 public void setBase(String base) { 223 this.base = base; 224 this.urlBase = parent(base); 225 if (base.endsWith("/")) urlBase = base; 226 } 227 228 public void startDocument() { 229 } 230 231 public void endDocument() { 232 } 233 234 public void ignorableWhitespace(char[] ch, int off, int cnt) 235 throws SAXException  236 { 237 characters(ch, off, cnt); 238 } 239 240 public void processingInstruction(String target, String data) { 241 } 242 243 public void setDocumentLocator(Locator locator) { 244 } 245 246 public void startElement(String tag, AttributeList attrs) 247 throws SAXException  248 { 249 try { 250 if (tag.equalsIgnoreCase("a")) { 251 String href = attrs.getValue("href"); 252 if (href != null) checkHref(href, parser.getLineNumber()); 253 } else if (tag.equalsIgnoreCase("img") || 254 tag.equalsIgnoreCase("frame")) { 255 String href = attrs.getValue("src"); 256 if (href != null) checkHref(href, parser.getLineNumber()); 257 } 258 } catch (Throwable t) { 259 t.printStackTrace(System.err); 260 System.err.println("tag = " + tag); 262 System.err.println("attrs = " + attrs); 263 System.err.println("urlBase = " + urlBase); 264 } 265 } 266 267 public void characters(char[] ch, int off, int len) throws SAXException { 268 } 269 270 public void endElement(String tag) throws SAXException { 271 } 272 273 public void checkHref(String href, int line) { 274 String tbase = urlBase; 275 href = href.trim(); 276 if (href.length() > 0 && href.charAt(0) == '/') { 277 href = href.substring(1); 278 tbase = ""; 279 } else if (href.startsWith("http://")) { 280 tbase = ""; 281 } else if (href.startsWith("ftp://") || 282 href.startsWith("mailto:")) { 283 return; 284 } else { 285 while (href.startsWith("./") || href.startsWith("../")) { 286 if (href.startsWith("./")) { 287 href = href.substring(2); 288 } else if (href.startsWith("../")) { 289 href = href.substring(3); 290 tbase = parent(tbase); 291 } 292 } 293 } 294 String url = tbase + href; 295 int idx = url.indexOf('#'); 296 if (idx >= 0) { 297 url = url.substring(0, idx); 298 } 299 if (url.length() == 0) return; 300 push(url, line); 301 } 302 303 static String parent(String s) { 304 for (int i = s.length() - 2; i >= 0; i--) { 305 if (s.charAt(i) == '/') return s.substring(0, i+1); 306 } 307 throw new RuntimeException ("Bad parent: " + s); 308 } 309 } 310 311
| Popular Tags
|