KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > openharmonise > rm > resources > content > utils > LinkChecker


/*
 * The contents of this file are subject to the
 * Mozilla Public License Version 1.1 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS IS"
 * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied.
 * See the License for the specific language governing rights and
 * limitations under the License.
 *
 * The Initial Developer of the Original Code is Simulacra Media Ltd.
 * Portions created by Simulacra Media Ltd are Copyright (C) Simulacra Media Ltd, 2004.
 *
 * All Rights Reserved.
 *
 * Contributor(s):
 *
 * Created: 29-Nov-2004 by jejking
 * Version: $Revision: 1.5 $
 * Last Updated: $Date: 2005/01/10 12:28:36 $
 */

23 package org.openharmonise.rm.resources.content.utils;
24
25
26 import java.io.IOException JavaDoc;
27 import java.net.*;
28 import java.text.*;
29 import java.util.*;
30 import java.util.logging.*;
31 import java.util.regex.*;
32
33 import javax.xml.parsers.*;
34
35 import org.openharmonise.rm.DataAccessException;
36 import org.openharmonise.rm.resources.content.Asset;
37 import org.w3c.dom.*;
38
39 /**
40  * URL Checking utility class.
41  *
42  * <p>
43  * Class runs through a list of Strings which are meant to be <code>URL</code>
44  * s, specifically http URLs (weblinks). For each String , it attempts to
45  * construct a <code>URL</code> object, and then attempts to connect to the
46  * resource represented by the URL. If any exceptions are thrown or if the HTTP
47  * response code is any other than 200 (OK), then the class records the String
48  * representing the URL and details about the issue. The code does <em>not</em>
49  * follow redirects as it cannot be assumed that all clients using the Timelines
50  * website will do so, but instead notes the new URL specified in the Location
51  * header.
52  * </p>
53  *
54  * <p>
55  * Once the class has finished connecting to all the URLs in the List passed in,
56  * it constructs a report on request. A standard "OK" report is produced if no
57  * issues were recorded, otherwise a report is constructed detailing the
58  * malfunctioning URLs and the problems encountered.
59  * </p>
60  *
61  * <p>
62  * <em>Note:</em> this class is part of the implementation of the BM
63  * requirement 3.41 as clarified in the SIM requirements clarification document,
64  * point 3.5.2.
65  * </p>
66  *
67  * Copyright SimulacraMedia 2003
68  *
69  * @author John King
70  * @version $Revision: 1.5 $
71  */

72 public class LinkChecker {
73     private List assetsToCheck;
74
75     private List errorsList; // holds any LinkError objects produced
76

77     private boolean bURLsChecked = false; // state flag, no reports can be
78
// produced until links have been
79
// checked
80

81     private Date dateRun;
82
83     private DateFormat dateFormat;
84
85     private DateFormat xmlDateTimeFormat;
86
87     private Hashtable errorCodes; // holds the http error codes
88

89     /**
90      * Logger for this class
91      */

92     private static final Logger m_logger = Logger.getLogger(LinkChecker.class
93             .getName());
94
95     {
96         errorCodes = new Hashtable();
97         errorCodes
98                 .put(
99                         new Integer JavaDoc(201),
100                         "Following a POST command, this indicates success, but the textual part of the response line indicates the URI by which the newly created document should be known.");
101         errorCodes
102                 .put(
103                         new Integer JavaDoc(202),
104                         "The request has been accepted for processing, but the processing has not been completed. The request may or may not eventually be acted upon, as it may be disallowed when processing actually takes place. there is no facility for status returns from asynchronous operations such as this.");
105         errorCodes
106                 .put(
107                         new Integer JavaDoc(203),
108                         "When received in the response to a GET command, this indicates that the returned metainformation is not a definitive set of the object from a server with a copy of the object, but is from a private overlaid web. This may include annotation information about the object, for example.");
109         errorCodes
110                 .put(
111                         new Integer JavaDoc(204),
112                         "Server has received the request but there is no information to send back, and the client should stay in the same document view. This is mainly to allow input for scripts without changing the document at the same time.");
113         errorCodes.put(new Integer JavaDoc(300), "Multiple Choices");
114         errorCodes
115                 .put(new Integer JavaDoc(301),
116                         "The requested resource has been assigned the following new URL: ");
117         errorCodes.put(new Integer JavaDoc(302),
118                 "The requested resource resides temporarily under the ");
119         errorCodes.put(new Integer JavaDoc(304), "304 Not Modified");
120         errorCodes
121                 .put(new Integer JavaDoc(305),
122                         "The requested resource MUST be accessed through the proxy given by ");
123         errorCodes.put(new Integer JavaDoc(306), "306 (Unused)");
124         errorCodes
125                 .put(new Integer JavaDoc(307),
126                         "The requested resource resides temporarily under the following URI: ");
127         errorCodes
128                 .put(new Integer JavaDoc(400),
129                         "The request had bad syntax or was inherently impossible to be satisfied.");
130         errorCodes
131                 .put(
132                         new Integer JavaDoc(401),
133                         "The parameter to this message gives a specification of authorization schemes which are acceptable. The client should retry the request with a suitable Authorization header.");
134         errorCodes
135                 .put(
136                         new Integer JavaDoc(402),
137                         "The parameter to this message gives a specification of charging schemes acceptable. The client may retry the request with a suitable ChargeTo header.");
138         errorCodes
139                 .put(new Integer JavaDoc(403),
140                         "The request is for something forbidden. Authorization will not help.");
141         errorCodes.put(new Integer JavaDoc(404),
142                 "The server has not found anything matching the URI given");
143         errorCodes
144                 .put(
145                         new Integer JavaDoc(500),
146                         "The server encountered an unexpected condition which prevented it from fulfilling the request.");
147         errorCodes.put(new Integer JavaDoc(501),
148                 "The server does not support the facility required.");
149         errorCodes
150                 .put(
151                         new Integer JavaDoc(502),
152                         "The server cannot process the request due to a high load (whether HTTP servicing or other requests). The implication is that this is a temporary condition which maybe alleviated at other times.");
153         errorCodes
154                 .put(
155                         new Integer JavaDoc(503),
156                         "This is equivalent to Internal Error 500, but in the case of a server which is in turn accessing some other service, this indicates that the respose from the other service did not return within a time that the gateway was prepared to wait. As from the point of view of the clientand the HTTP transaction the other service is hidden within the server, this maybe treated identically to Internal error 500, but has more diagnostic value.");
157     }
158
159     /**
160      * Constructs a new <code>LinkChecker</code> instance.
161      *
162      * @param urlsToCheck
163      * list of URLs to check in the form of String objects
164      * @throws NullPointerException
165      * if urlsToCheck was null
166      */

167     public LinkChecker(List assetsToCheck) {
168         if (assetsToCheck == null) {
169             //TODO make this fit into Papaya exception handling patterns
170
throw new NullPointerException JavaDoc("urlsToCheck was null");
171         }
172         this.assetsToCheck = assetsToCheck;
173         errorsList = new ArrayList();
174         dateFormat = new SimpleDateFormat("EEE, d MMM yyyy HH:mm");
175         xmlDateTimeFormat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss");
176     }
177
178     /**
179      * Checks the links specified in the list of URLs specified in the call to
180      * the constructor.
181      *
182      * <p>
183      * Iterates through the list of Strings supplied. It first attempts to
184      * construct a URL object using the string. If this fails, due to a
185      * MalformedURLException, a <code>LinkError</code> is created and added to
186      * the errors list. If the URL object is created successfully, it attempts
187      * to connect to to it. If an attempt to connect results in an Exception or
188      * if the connection succeeds but returns any HTTP status code other than
189      * 200 (OK), then a <code>LinkError</code> is created and noted in the
190      * errors list.
191      * </p>
192      *
193      * <p>
194      * This implementation attempts to be robust. It anticipates that users will
195      * not necessarily provide a full, compliant URL but may abbreviate web
196      * links to, say, www.example.com rather than http://www.example.com. It
197      * looks for the :// characters which mark the boundary between the protocol
198      * specifier and the machine address and if these are missing, prepends
199      * http:// to the string before checking to form, hopefully, a syntactically
200      * correct URL before attempting to resolve the link.
201      * </p>
202      *
203      * @see org.openharmonise.rm.resources.content.utils.LinkError
204      */

205     public void checkLinks() {
206         // go through the list of urls and check each one
207
Iterator it = assetsToCheck.iterator();
208         Pattern pattern = Pattern.compile(".*://.*");
209         HttpURLConnection.setFollowRedirects(false);
210         while (it.hasNext()) {
211             Asset asset = (Asset) it.next();
212             String JavaDoc URLString = null;
213             try {
214                 URLString = asset.getFullURL();
215             } catch (DataAccessException dae) {
216                 createError(asset, dae);
217                 handleException(dae);
218             }
219
220             /*
221              * it is a very strong possibility that users will not provide
222              * correctly formed URLs because they draw on their browsers which
223              * turn www.theregister.co.uk into http://www.theregister.co.uk
224              * behind the scenes. We must anticipate this and if a protocol
225              * element is missing, we will preprend http://
226              */

227             Matcher matcher = pattern.matcher(URLString);
228             if (!matcher.matches()) {
229                 // we probably don't have a protocol element, so we'll prepend
230
// one
231
URLString = "http://" + URLString;
232             }
233
234             try {
235                 
236                 URL url = new URL(URLString);
237                 
238                 String JavaDoc sHost = url.getHost();
239                 
240                 if(sHost.indexOf(" ") >= 0 || sHost.indexOf(".") < 0) {
241                     throw new MalformedURLException("'" + URLString + "' is a malformed URL");
242                 }
243                 
244                 HttpURLConnection conn = (HttpURLConnection) url
245                         .openConnection();
246                 int status = conn.getResponseCode();
247                 if (status != HttpURLConnection.HTTP_OK) {
248                     createError(asset, conn);
249                 }
250                 
251             } catch (MalformedURLException mue) {
252                 createError(asset, mue);
253             } catch (IOException JavaDoc ioe) {
254                 createError(asset, ioe);
255                 handleException(ioe);
256             } catch (ClassCastException JavaDoc cce) { // particularly indicative of non
257
// web URL
258
createError(asset, cce);
259                 handleException(cce);
260             }
261         }
262         bURLsChecked = true;
263         dateRun = new Date();
264     }
265
266     /**
267      * Creates a simple dated, plain text formatted report
268      *
269      * <p>
270      * Produces a dated simple report on the outcome of the link checking
271      * activity. If there were no errors, then this is stated. If there were
272      * errors, then each is noted along with the URL concerned.
273      * </p>
274      *
275      * @return a dated, plain text formatted report
276      * @throws IllegalStateException
277      * if the <code>checkLinks</code> method has not previously
278      * been called
279      */

280     public String JavaDoc getReport() { //TODO this is an interim and very basic plain
281
// text report
282
if (bURLsChecked == false) {
283             throw new IllegalStateException JavaDoc("URLs have not yet been checked");
284         }
285         StringBuffer JavaDoc report = new StringBuffer JavaDoc(1500);
286
287         report.append("Link Checking Report\n");
288         report.append("Run at: ");
289         report.append(dateFormat.format(dateRun));
290         report.append("\n");
291
292         if (errorsList.size() == 0) { // ie, no errors were recorded
293
report.append("No errors were detected");
294         } else { // else, we had errors, put them in the report
295
Iterator errorsIt = errorsList.iterator();
296             while (errorsIt.hasNext()) {
297                 try {
298                     report.append("\n");
299                     LinkStatus status = (LinkStatus) errorsIt.next();
300                     report.append(status.getAsset().getURI());
301                     report.append("\n");
302                     report.append(status.getErrorMessage());
303                     if (status.getNewURL() != null) {
304                         report.append("\nNew Location: " + status.getNewURL());
305                     }
306                     report.append("\n");
307                 } catch (DataAccessException e) {
308                     m_logger.log(Level.WARNING, e.getLocalizedMessage(), e);
309                 }
310             }
311         }
312         return report.toString();
313     }
314
315     public Document getXMLReport() {
316         if (bURLsChecked == false) {
317             throw new IllegalStateException JavaDoc("URLs have not yet been checked");
318         }
319         Document doc = null;
320
321         try {
322             DocumentBuilderFactory factory = DocumentBuilderFactory
323                     .newInstance();
324             DocumentBuilder docBuilder = factory.newDocumentBuilder();
325             doc = docBuilder.newDocument();
326         } catch (FactoryConfigurationError e) {
327             m_logger.log(Level.WARNING, e.getLocalizedMessage(), e);
328         } catch (ParserConfigurationException e) {
329             m_logger.log(Level.WARNING, e.getLocalizedMessage(), e);
330         }
331
332         Element reportElement = doc.createElement("ReportInstance");
333         doc.appendChild(reportElement);
334         reportElement.setAttribute("date", xmlDateTimeFormat.format(dateRun));
335         Element listElement = doc.createElement("List");
336         reportElement.appendChild(listElement);
337
338         Iterator errorsIt = errorsList.iterator();
339         while (errorsIt.hasNext()) {
340             try {
341                 LinkStatus status = (LinkStatus) errorsIt.next();
342                 //Element errorElement = doc.createElement("error");
343
//errorElement.setAttribute("url", error.getURLString());
344
Element reportRowElement = doc.createElement("ReportRow");
345                 listElement.appendChild(reportRowElement);
346
347                 Element objectElement = doc.createElement("Object");
348                 reportRowElement.appendChild(objectElement);
349                 Asset asset = status.getAsset(); // get the asset object
350

351                 Element nameElement = null;
352                 Text nameTxt = null;
353
354                 nameElement = doc.createElement("DisplayName");
355
356                 if (asset.getDisplayName() == null
357                         || asset.getDisplayName().equals("")) {
358                     nameTxt = doc.createTextNode(asset.getName());
359                 } else {
360                     nameTxt = doc.createTextNode(asset.getDisplayName());
361                 }
362
363                 nameElement.appendChild(nameTxt);
364                 objectElement.appendChild(nameElement);
365
366                 Element pathElement = doc.createElement("Path");
367                 objectElement.appendChild(pathElement);
368                 Text pathTxt = doc.createTextNode(asset.getFullPath());
369                 pathElement.appendChild(pathTxt);
370
371                 Element properiesElement = doc.createElement("Properties");
372                 reportRowElement.appendChild(properiesElement);
373
374                 Element userElement = doc.createElement("User");
375                 reportRowElement.appendChild(userElement);
376
377                 Element dateModifiedElement = doc.createElement("DateModified");
378                 reportRowElement.appendChild(dateModifiedElement);
379
380                 Element actionElement = doc.createElement("Action");
381                 reportRowElement.appendChild(actionElement);
382
383                 Element statusElement = doc.createElement("Status");
384                 reportRowElement.appendChild(statusElement);
385                 Text statusTxt = doc.createTextNode(status.getErrorMessage());
386                 statusElement.appendChild(statusTxt);
387             } catch (DOMException e) {
388                 m_logger.log(Level.WARNING, e.getLocalizedMessage(), e);
389             } catch (DataAccessException e) {
390                 m_logger.log(Level.WARNING, e.getLocalizedMessage(), e);
391             }
392         }
393
394         return doc;
395     }
396
397     /**
398      * Returns the results of the link checking activity as a List of
399      * <code>LinkError</code>s.
400      *
401      * @return unmodifiable List containing <code>LinkError</code> instances
402      * recording each error encountered. If no errors were encountered,
403      * this List will be empty.
404      * @throws IllegalStateException
405      * if the <code>checkLinks</code> method has not previously
406      * been called
407      * @see org.openharmonise.rm.resources.content.utils.LinkError
408      */

409     public List getErrorsList() {
410         if (bURLsChecked == false) {
411             throw new IllegalStateException JavaDoc("URLs have not yet been checked");
412         } else {
413             return Collections.unmodifiableList(errorsList);
414         }
415     }
416
417     // temporary hack. this will log and rethrow the proper sort of Harmonise
418
// Exception
419
private void handleException(Exception JavaDoc e) {
420         m_logger.log(Level.WARNING, e.getLocalizedMessage(), e);
421     }
422
423     /**
424      * Creates and registers an error in connecting to a URL.
425      *
426      * <p>
427      * Creates a <code>LinkError</code> and stores it in the internal errors
428      * list. Uses the <code>HttpURLConnection</code> to obtain sufficient
429      * detail to provide an error that is more meaningful than an integer HTTP
430      * status code.
431      * </p>
432      *
433      * @param url
434      * URL where the problem was encountered
435      * @param conn
436      * HttpURLConnection to the problem URL, used to get more details
437      */

438     private void createError(Asset asset, HttpURLConnection conn) {
439         LinkStatus status = null;
440         try {
441             // response message is slightly more informative than the code alone
442
status = new LinkStatus(asset, (String JavaDoc) errorCodes.get(new Integer JavaDoc(
443                     conn.getResponseCode()))
444                     + conn.getHeaderField("Location"));
445             if (conn.getResponseCode() >= 300 && conn.getResponseCode() < 308) { // i.e.
446
// a
447
// redirect
448
status.setNewURL(conn.getHeaderField("Location"));
449             }
450         } catch (IOException JavaDoc ioe) {
451             status = new LinkStatus(asset, ioe.getMessage());
452         }
453         errorsList.add(status);
454     }
455
456     /**
457      * Creates and registers an error caused by an exception thrown whilst
458      * attempting to connect to a URL.
459      *
460      * <p>
461      * Creates a <code>LinkError</code> and stores it in the internal errors
462      * list. Extracts the message from the exception to provide a meaningful
463      * error message.
464      * </p>
465      *
466      * @param url
467      * URL where the exception was encountered
468      * @param ex
469      * the exception thrown
470      */

471     private void createError(Asset asset, Exception JavaDoc ex) {
472         String JavaDoc errorMessage = ex.getMessage();
473
474         try {
475             if (ex instanceof UnknownHostException) {
476                 errorMessage = "The host " + asset.getURI() + " is unknown.";
477             }
478         } catch (DataAccessException e) {
479             errorMessage = "There was a problem trying to get the resource URI.";
480         }
481
482         LinkStatus status = new LinkStatus(asset, errorMessage);
483         errorsList.add(status);
484     }
485
486 }
487
Popular Tags