KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > com > quadcap > http > client > LinkChecker


package com.quadcap.http.client;

/* Copyright 1998 - 2003 Quadcap Software. All rights reserved.
 *
 * This software is distributed under the Quadcap Free Software License.
 * This software may be used or modified for any purpose, personal or
 * commercial. Open Source redistributions are permitted. Commercial
 * redistribution of larger works derived from, or works which bundle
 * this software requires a "Commercial Redistribution License"; see
 * http://www.quadcap.com/purchase.
 *
 * Redistributions qualify as "Open Source" under one of the following terms:
 *
 * Redistributions are made at no charge beyond the reasonable cost of
 * materials and delivery.
 *
 * Redistributions are accompanied by a copy of the Source Code or by an
 * irrevocable offer to provide a copy of the Source Code for up to three
 * years at the cost of materials and delivery. Such redistributions
 * must allow further use, modification, and redistribution of the Source
 * Code under substantially the same terms as this license.
 *
 * Redistributions of source code must retain the copyright notices as they
 * appear in each source code file, these license terms, and the
 * disclaimer/limitation of liability set forth as paragraph 6 below.
 *
 * Redistributions in binary form must reproduce this Copyright Notice,
 * these license terms, and the disclaimer/limitation of liability set
 * forth as paragraph 6 below, in the documentation and/or other materials
 * provided with the distribution.
 *
 * The Software is provided on an "AS IS" basis. No warranty is
 * provided that the Software is free of defects, or fit for a
 * particular purpose.
 *
 * Limitation of Liability. Quadcap Software shall not be liable
 * for any damages suffered by the Licensee or any third party resulting
 * from use of the Software.
 */

import java.io.*;

import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;

import org.xml.sax.AttributeList;
import org.xml.sax.DocumentHandler;
import org.xml.sax.DTDHandler;
import org.xml.sax.EntityResolver;
import org.xml.sax.ErrorHandler;
import org.xml.sax.HandlerBase;
import org.xml.sax.InputSource;
import org.xml.sax.Locator;
import org.xml.sax.SAXException;

import com.quadcap.text.sax.Parser;

import com.quadcap.http.util.HeaderParser;

import com.quadcap.util.collections.ArrayQueue;
import com.quadcap.util.collections.DiGraph;

import com.quadcap.util.text.OctetMap;
import com.quadcap.util.text.Scanner;

import com.quadcap.util.Debug;
import com.quadcap.util.Util;


72 /**
73  * This class implements a simple link checker, following links
74  * in the following tags:
75  *
76  * <ul>
77  * <li><b>&lt;A HREF=""&gt;
78  * <li><b>&lt;IMG SRC=""&gt;
79  * <li><b>&lt;FRAME SRC=""&gt;
80  * </ul>
81  */

82 public class LinkChecker implements DocumentHandler JavaDoc {
83     /** uri of the document we're currently fetching and parsing */
84     String JavaDoc base;
85
86     /** base uri of the current document for relative href resolution */
87     String JavaDoc urlBase;
88
89     /** uri of the document we're currently fetching and parsing */
90     String JavaDoc currentUrl;
91
92     /** directed graph of all links found so far (even bad ones...) */
93     DiGraph links = new DiGraph();
94
95     /** queue of links to check */
96     ArrayQueue linksToCheck = new ArrayQueue();
97
98     /** uri -> status for all links */
99     HashMap JavaDoc allLinks = new HashMap JavaDoc();
100
101     /** uri -> status for completed links */
102     HashMap JavaDoc linksChecked = new HashMap JavaDoc();
103
104     Parser parser;
105     String JavaDoc host;
106         
107     public LinkChecker(String JavaDoc url) {
108         parser = new Parser();
109         String JavaDoc s = url;
110     if (s.startsWith("http://")) {
111         s = url.substring("http://".length());
112     }
113     int idx = s.indexOf('/');
114     if (idx > 0) s = s.substring(0, idx);
115     host = "http://" + s;
116     push(url, 0);
117     }
118
119     synchronized void push(String JavaDoc url, int line) {
120         if (allLinks.get(url) == null && url.startsWith(host)) {
121             System.out.println("PUSH " + trim(base) + " -> " + trim(url));
122             if (currentUrl != null) {
123                 links.addArc(currentUrl + ":" + line, url);
124             }
125             allLinks.put(url, "queued");
126             linksToCheck.push(url);
127         }
128     }
129
130     String JavaDoc trim(String JavaDoc url) {
131         if (url != null && url.startsWith(host)) {
132             url = url.substring(host.length());
133         }
134         return url;
135     }
136
137     public void printBadLinks() {
138         ArrayList JavaDoc k = new ArrayList JavaDoc();
139         Iterator JavaDoc iter = linksChecked.keySet().iterator();
140         while (iter.hasNext()) {
141             String JavaDoc url = iter.next().toString();
142             String JavaDoc val = linksChecked.get(url).toString();
143             if (!val.equals("found")) {
144                 Iterator JavaDoc x = links.getParents(url);
145                 String JavaDoc ref = x.hasNext() ? x.next().toString() : "";
146                 k.add(trim(ref) + "\n error: " + trim(url));
147             }
148         }
149         Collections.sort(k);
150         iter = k.iterator();
151         while (iter.hasNext()) {
152             System.out.println(iter.next().toString());
153         }
154         System.out.println("--------------------\n");
155         System.out.println("" + k.size() + " errors");
156     }
157     
158     public void run() throws Exception JavaDoc {
159     //HtmlParser parser = new HtmlParser();
160
int cnt = 0;
161     while (linksToCheck.size() > 0) {
162             System.out.print("" + (linksChecked.size()+1) + " of " +
163                              (linksToCheck.size() + linksChecked.size()) +
164                              ": ");
165         String JavaDoc url = linksToCheck.popBack().toString();
166             if (linksChecked.get(url) != null) continue;
167             System.out.println(trim(url));
168             currentUrl = url;
169         InputStream is = null;
170         try {
171                 is = HttpFetcher.fetchStream(url);
172         Scanner scanner = new Scanner(is);
173         HashMap JavaDoc headers = new HashMap JavaDoc();
174         scanner.skipUntil(OctetMap.wsChars);
175         scanner.skipWhile(OctetMap.wsChars);
176         String JavaDoc resp = scanner.parseUntil(OctetMap.crlfChars);
177         HeaderParser.parseCRLF(scanner);
178         HeaderParser.parseHeaders(scanner, headers);
179         if (!resp.startsWith("200")) {
180                     allLinks.put(url, "missing");
181                     linksChecked.put(url, "missing");
182                     Iterator JavaDoc iter = links.getParents(url);
183                     String JavaDoc referrer =
184                         iter.hasNext()
185                         ? iter.next().toString()
186                         : "---";
187             System.err.println("*** " + trim(url) + "," +
188                                        trim(referrer) + "," + resp);
189             continue;
190         }
191         String JavaDoc mimeType = (String JavaDoc)headers.get("content-type");
192         if (mimeType == null || !mimeType.equals("text/html")) {
193             continue;
194         }
195         InputStreamReader r = new InputStreamReader(is);
196         InputSource JavaDoc in = new InputSource JavaDoc(r);
197         in.setSystemId(url);
198         parser.setDocumentHandler(this);
199         setBase(url);
200         parser.parse(in);
201                 allLinks.put(url, "found");
202                 linksChecked.put(url, "found");
203             } catch (IOException e) {
204                 Debug.print(e);
205                 allLinks.put(url, "error");
206                 linksChecked.put(url, "error");
207             } catch (Exception JavaDoc e3) {
208                 Debug.print(e3);
209                 allLinks.put(url, "exception");
210                 linksChecked.put(url, "exception");
211             } catch (Throwable JavaDoc t) {
212                 Debug.print(t);
213                 allLinks.put(url, "exception");
214                 linksChecked.put(url, "exception");
215         } finally {
216         if (is != null) is.close();
217                 //System.out.println("Result: " + allLinks.get(url));
218
}
219     }
220     }
221     
222     public void setBase(String JavaDoc base) {
223     this.base = base;
224     this.urlBase = parent(base);
225     if (base.endsWith("/")) urlBase = base;
226     }
227
228     public void startDocument() {
229     }
230
231     public void endDocument() {
232     }
233
234     public void ignorableWhitespace(char[] ch, int off, int cnt)
235     throws SAXException JavaDoc
236     {
237     characters(ch, off, cnt);
238     }
239
240     public void processingInstruction(String JavaDoc target, String JavaDoc data) {
241     }
242
243     public void setDocumentLocator(Locator JavaDoc locator) {
244     }
245
246     public void startElement(String JavaDoc tag, AttributeList JavaDoc attrs)
247     throws SAXException JavaDoc
248     {
249     try {
250         if (tag.equalsIgnoreCase("a")) {
251         String JavaDoc href = attrs.getValue("href");
252         if (href != null) checkHref(href, parser.getLineNumber());
253         } else if (tag.equalsIgnoreCase("img") ||
254                tag.equalsIgnoreCase("frame")) {
255         String JavaDoc href = attrs.getValue("src");
256         if (href != null) checkHref(href, parser.getLineNumber());
257         }
258     } catch (Throwable JavaDoc t) {
259         t.printStackTrace(System.err);
260         //System.err.println(t.toString());
261
System.err.println("tag = " + tag);
262         System.err.println("attrs = " + attrs);
263         System.err.println("urlBase = " + urlBase);
264     }
265     }
266    
267     public void characters(char[] ch, int off, int len) throws SAXException JavaDoc {
268     }
269
270     public void endElement(String JavaDoc tag) throws SAXException JavaDoc {
271     }
272
273     public void checkHref(String JavaDoc href, int line) {
274     String JavaDoc tbase = urlBase;
275     href = href.trim();
276     if (href.length() > 0 && href.charAt(0) == '/') {
277         href = href.substring(1);
278             tbase = "";
279     } else if (href.startsWith("http://")) {
280         tbase = "";
281     } else if (href.startsWith("ftp://") ||
282         href.startsWith("mailto:")) {
283         return;
284     } else {
285         while (href.startsWith("./") || href.startsWith("../")) {
286         if (href.startsWith("./")) {
287             href = href.substring(2);
288         } else if (href.startsWith("../")) {
289             href = href.substring(3);
290             tbase = parent(tbase);
291         }
292         }
293     }
294     String JavaDoc url = tbase + href;
295     int idx = url.indexOf('#');
296     if (idx >= 0) {
297         url = url.substring(0, idx);
298     }
299     if (url.length() == 0) return;
300         push(url, line);
301     }
302
303     static String JavaDoc parent(String JavaDoc s) {
304     for (int i = s.length() - 2; i >= 0; i--) {
305         if (s.charAt(i) == '/') return s.substring(0, i+1);
306     }
307     throw new RuntimeException JavaDoc("Bad parent: " + s);
308     }
309 }
310
311
Popular Tags